From 705d14ec30d85a0a19d930cbbad02a63079d746f Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 25 Apr 2023 14:42:00 +0200 Subject: [PATCH 001/173] add window size parameter in ppanggolin context --- ppanggolin/context/searchGeneContext.py | 66 ++++++++++++++++++------- 1 file changed, 49 insertions(+), 17 deletions(-) diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index 75e2350c..e1b91282 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -6,6 +6,9 @@ import tempfile import time import logging +import os +from typing import List, Dict, Tuple + # installed libraries from tqdm import tqdm @@ -23,7 +26,7 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: str, sequences: str = None, families: str = None, transitive: int = 4, identity: float = 0.5, - coverage: float = 0.8, jaccard: float = 0.85, no_defrag: bool = False, + coverage: float = 0.8, jaccard: float = 0.85, window_size: int = 1, no_defrag: bool = False, cpu: int = 1, disable_bar=True): """ Main function to search common gene contexts between sequence set and pangenome families @@ -37,6 +40,7 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: :param identity: minimum identity threshold between sequences and gene families for the alignment :param coverage: minimum coverage threshold between sequences and gene families for the alignment :param jaccard: Jaccard index to filter edges in graph + :param window_size: Number of genes to consider in the gene context. 
:param no_defrag: do not use the defrag workflow if true :param cpu: Number of core used to process :param disable_bar: Allow preventing bar progress print @@ -45,7 +49,7 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: # check statuses and load info if sequences is not None and pangenome.status["geneFamilySequences"] not in ["inFile", "Loaded", "Computed"]: raise Exception("Cannot use this function as your pangenome does not have gene families representatives " - "associated to it. For now this works only if the clustering is realised by PPanGGOLiN.") + "associated to it. For now this works only if the clustering has been made by PPanGGOLiN.") check_pangenome_info(pangenome, need_annotations=True, need_families=True, disable_bar=disable_bar) gene_families = {} @@ -56,9 +60,12 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: seq_set, _, seq2pan = get_seq2pang(pangenome, sequences, output, new_tmpdir, cpu, no_defrag, identity, coverage) project_partition(seq2pan, seq_set, output) + new_tmpdir.cleanup() - for k, v in seq2pan.items(): - gene_families[v.name] = v + + for pan_family in seq2pan.values(): + gene_families[pan_family.name] = pan_family + fam_2_seq = fam2seq(seq2pan) if families is not None: @@ -66,16 +73,20 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: for fam_name in f.read().splitlines(): gene_families[fam_name] = pangenome.get_gene_family(fam_name) + half_window = round((window_size-1)/2) + logging.info(f'Window size of {half_window*2 + 1}. 
Gene context will include {half_window} genes on each side of the target gene.') + # Compute the graph with transitive closure size provided as parameter start_time = time.time() logging.getLogger().info("Building the graph...") - g = compute_gene_context_graph(families=gene_families, t=transitive, disable_bar=disable_bar) + gene_context_graph = compute_gene_context_graph(families=gene_families, t=transitive, half_window=half_window, disable_bar=disable_bar) logging.getLogger().info( f"Took {round(time.time() - start_time, 2)} seconds to build the graph to find common gene contexts") - logging.getLogger().debug(f"There are {nx.number_of_nodes(g)} nodes and {nx.number_of_edges(g)} edges") - + logging.getLogger().debug(f"There are {nx.number_of_nodes(gene_context_graph)} nodes and {nx.number_of_edges(gene_context_graph)} edges") + + # extract the modules from the graph - common_components = compute_gene_context(g, jaccard) + common_components = compute_gene_context(gene_context_graph, jaccard) families = set() for gene_context in common_components: @@ -89,12 +100,19 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: logging.getLogger().info(f"Computing gene contexts took {round(time.time() - start_time, 2)} seconds") -def compute_gene_context_graph(families: dict, t: int = 4, disable_bar: bool = False) -> nx.Graph: + + # for e, data in gene_context_graph(data=True): + + # nx.write_graphml_lxml(gene_context_graph, os.path.join(output, "context.graphml")) + + +def compute_gene_context_graph(families: dict, t: int = 4, half_window: int = 0, disable_bar: bool = False) -> nx.Graph: """ Construct the graph of gene contexts between families of the pan :param families: Gene families of interest :param t: transitive value + :param half_window: An integer specifying the number of genes to include in the context on each side of the gene of interest. 
:param disable_bar: Prevents progress bar printing :return: Graph of gene contexts between interesting gene families of the pan @@ -104,7 +122,7 @@ def compute_gene_context_graph(families: dict, t: int = 4, disable_bar: bool = F for family in tqdm(families.values(), unit="families", disable=disable_bar): for gene in family.genes: contig = gene.contig.genes - pos_left, in_context_left, pos_right, in_context_right = extract_gene_context(gene, contig, families, t) + pos_left, in_context_left, pos_right, in_context_right = extract_gene_context(gene, contig, families, t, half_window) if in_context_left or in_context_right: for env_gene in contig[pos_left:pos_right + 1]: _compute_gene_context_graph(g, env_gene, contig, pos_right) @@ -133,29 +151,40 @@ def _compute_gene_context_graph(g: nx.Graph, env_gene: Gene, contig: Contig, pos pos += 1 -def extract_gene_context(gene: Gene, contig: list, families: dict, t: int = 4) -> (int, bool, int, bool): + +def extract_gene_context(gene: Gene, contig: List[Gene], families: Dict[str, str], t: int = 4, half_window: int = 0) -> Tuple[int, bool, int, bool]: """ - Extract gene context and whether said gene context exists + Determine the left and rigth position of the gene context and whether said gene context exists. :param gene: Gene of interest :param contig: list of genes in contig :param families: Alignment results :param t: transitive value + :param half_window: An integer specifying the number of genes to include in the context on each side of the gene of interest. 
:return: Position of the context and if it exists for each side ('left' and 'right') """ - pos_left, pos_right = (max(0, gene.position - t), - min(gene.position + t, len(contig) - 1)) # Gene positions to compare family + search_window = max(t, half_window) + + pos_left, pos_right = (max(0, gene.position - search_window), + min(gene.position + search_window, len(contig) - 1)) # Gene positions to compare family + in_context_left, in_context_right = (False, False) while pos_left < gene.position and not in_context_left: - if contig[pos_left].family in families.values(): + if gene.position - pos_left <= half_window: + # position is in the window + in_context_left = True + + elif contig[pos_left].family in families.values(): in_context_left = True else: pos_left += 1 while pos_right > gene.position and not in_context_right: - if contig[pos_right].family in families.values(): + if pos_right - gene.position <= half_window: + in_context_right = True + elif contig[pos_right].family in families.values(): in_context_right = True else: pos_right -= 1 @@ -243,7 +272,7 @@ def launch(args: argparse.Namespace): pangenome.add_file(args.pangenome) search_gene_context_in_pangenome(pangenome=pangenome, output=args.output, tmpdir=args.tmpdir, sequences=args.sequences, families=args.family, transitive=args.transitive, - identity=args.identity, coverage=args.coverage, jaccard=args.jaccard, + identity=args.identity, coverage=args.coverage, jaccard=args.jaccard, window_size=args.window_size, no_defrag=args.no_defrag, cpu=args.cpu, disable_bar=args.disable_prog_bar) @@ -291,6 +320,9 @@ def parser_context(parser: argparse.ArgumentParser): help="Size of the transitive closure used to build the graph. This indicates the number of " "non related genes allowed in-between two related genes. 
Increasing it will improve " "precision but lower sensitivity a little.") + optional.add_argument("-w", "--window_size", required=False, type=int, default=1, + help="Number of genes adjacent to a gene of interest to consider in the gene context even if they are non related genes.") + optional.add_argument("-s", "--jaccard", required=False, type=restricted_float, default=0.85, help="minimum jaccard similarity used to filter edges between gene families. Increasing it " "will improve precision but lower sensitivity a lot.") From 1ab7f5d124cd887b6e2eabffa8a3c475ea6a51c5 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 3 May 2023 18:03:07 +0200 Subject: [PATCH 002/173] Add new fct extract_contig_window and tests --- ppanggolin/context/searchGeneContext.py | 37 +++++++++++++++++++++++-- tests/context/test_context.py | 27 ++++++++++++++++++ 2 files changed, 61 insertions(+), 3 deletions(-) create mode 100644 tests/context/test_context.py diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index e1b91282..1c4bdc62 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -7,8 +7,8 @@ import time import logging import os -from typing import List, Dict, Tuple - +from typing import List, Dict, Tuple, Iterable +from itertools import zip_longest # installed libraries from tqdm import tqdm @@ -106,6 +106,37 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: # nx.write_graphml_lxml(gene_context_graph, os.path.join(output, "context.graphml")) + +def extract_contig_window(contig_length: int, positions_of_interest: Iterable[int], window_size: int): + """ + Extracts contiguous windows around positions of interest within a contig. + + :param contig_length: The length of the contig. + :param positions_of_interest: An iterable containing the positions of interest. + :param window_size: The size of the window to extract around each position of interest. 
+ :return: Yields tuples representing the start and end positions of each contiguous window. + """ + + sorted_positions = sorted(positions_of_interest) + + if sorted_positions[0] <0 or sorted_positions[-1] >= contig_length: + raise IndexError(f'Positions of interest are out of range. ' + f"Contig length is {contig_length} while given min={sorted_positions[0]} & max={sorted_positions[-1]} positions") + + start_po = max(sorted_positions[0] - window_size, 0) + + for position, next_po in zip_longest(sorted_positions, sorted_positions[1:]): + if next_po is None: + end_po = min(position + window_size, contig_length-1) + yield start_po, end_po + break + + if position + window_size < next_po - window_size: + end_po = min(position + window_size, contig_length-1) + yield start_po, end_po + start_po = max(next_po - window_size, 0) + + def compute_gene_context_graph(families: dict, t: int = 4, half_window: int = 0, disable_bar: bool = False) -> nx.Graph: """ Construct the graph of gene contexts between families of the pan @@ -321,7 +352,7 @@ def parser_context(parser: argparse.ArgumentParser): "non related genes allowed in-between two related genes. Increasing it will improve " "precision but lower sensitivity a little.") optional.add_argument("-w", "--window_size", required=False, type=int, default=1, - help="Number of genes adjacent to a gene of interest to consider in the gene context even if they are non related genes.") + help="Number of neighboring genes that are considered when searching for conserved genomic contexts around a gene of interest.") optional.add_argument("-s", "--jaccard", required=False, type=restricted_float, default=0.85, help="minimum jaccard similarity used to filter edges between gene families. 
Increasing it " diff --git a/tests/context/test_context.py b/tests/context/test_context.py new file mode 100644 index 00000000..a62b51dd --- /dev/null +++ b/tests/context/test_context.py @@ -0,0 +1,27 @@ +import pytest +from ppanggolin.context.searchGeneContext import extract_contig_window + + +def test_extract_contig_window(): + assert list(extract_contig_window(contig_length=15, positions_of_interest={8}, window_size=1)) == [(7,9)] + + # check that extracted window is inside contig limit + assert list(extract_contig_window(contig_length=16, positions_of_interest={15}, window_size=4)) == [(11,15)] + + assert list(extract_contig_window(contig_length=10, positions_of_interest={2, 8}, window_size=2)) == [(0,4), (6,9)] + + assert list(extract_contig_window(contig_length=10, positions_of_interest={2, 5, 8}, window_size=2)) == [(0,9)] + + # # check that circularity is properly taken into account + # assert list(extract_contig_window(contig_length=11, positions_of_interest={0}, window_size=2, is_circular=True)) == [(9,2)] + + # assert list(extract_contig_window(contig_length=11, positions_of_interest={0, 9}, window_size=2, is_circular=True)) == [(7,2)] + + # assert list(extract_contig_window(contig_length=11, positions_of_interest={0, 9}, window_size=2, is_circular=False)) == [(0,2), (7,11)] + + + with pytest.raises(IndexError): + list(extract_contig_window(contig_length=15, positions_of_interest={15}, window_size=1)) + + with pytest.raises(IndexError): + list(extract_contig_window(contig_length=15, positions_of_interest={-1}, window_size=1)) \ No newline at end of file From 3ee7def3190d3d612574cd4cdca7bde5fe53f344 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Thu, 4 May 2023 13:20:38 +0200 Subject: [PATCH 003/173] add possibility to process circular contig --- ppanggolin/context/searchGeneContext.py | 37 +++++++++++++++++++++---- tests/context/test_context.py | 32 ++++++++++++++------- 2 files changed, 53 insertions(+), 16 deletions(-) diff --git 
a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index 1c4bdc62..762c6137 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -106,35 +106,60 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: # nx.write_graphml_lxml(gene_context_graph, os.path.join(output, "context.graphml")) - -def extract_contig_window(contig_length: int, positions_of_interest: Iterable[int], window_size: int): +def extract_contig_window(contig_length: int, positions_of_interest: Iterable[int], window_size: int, is_circular:bool = False): """ Extracts contiguous windows around positions of interest within a contig. :param contig_length: The length of the contig. :param positions_of_interest: An iterable containing the positions of interest. :param window_size: The size of the window to extract around each position of interest. + :param is_circular: Indicates if the contig is circular. :return: Yields tuples representing the start and end positions of each contiguous window. """ + windows_coordinates = [] + # Sort the positions of interest sorted_positions = sorted(positions_of_interest) + # Check if any position of interest is out of range if sorted_positions[0] <0 or sorted_positions[-1] >= contig_length: raise IndexError(f'Positions of interest are out of range. ' f"Contig length is {contig_length} while given min={sorted_positions[0]} & max={sorted_positions[-1]} positions") + first_position = sorted_positions[0] + last_position = sorted_positions[-1] + if is_circular: + # in a circular contig, if the window of a gene of interest overlaps the end/start of the contig + # an out of scope position is added to the sorted positions to take into account those positions + # the returned window are always checked that its positions are not out of range... 
+ # so there's no chance to find an out of scope position in final list + if first_position - window_size < 0: + out_of_scope_position = (contig_length ) + first_position + sorted_positions.append(out_of_scope_position) + + if last_position + window_size >= contig_length : + out_of_scope_position = contig_length-1 - (last_position + window_size) + sorted_positions.insert(0, out_of_scope_position) + start_po = max(sorted_positions[0] - window_size, 0) for position, next_po in zip_longest(sorted_positions, sorted_positions[1:]): + if next_po is None: + # If there are no more positions, add the final window end_po = min(position + window_size, contig_length-1) - yield start_po, end_po - break + windows_coordinates.append((start_po, end_po)) - if position + window_size < next_po - window_size: + elif position + window_size +1 < next_po - window_size: + # If there is a gap between positions, add the current window + # and update the start position for the next window end_po = min(position + window_size, contig_length-1) - yield start_po, end_po + + windows_coordinates.append((start_po, end_po)) + start_po = max(next_po - window_size, 0) + + return windows_coordinates def compute_gene_context_graph(families: dict, t: int = 4, half_window: int = 0, disable_bar: bool = False) -> nx.Graph: diff --git a/tests/context/test_context.py b/tests/context/test_context.py index a62b51dd..8597444d 100644 --- a/tests/context/test_context.py +++ b/tests/context/test_context.py @@ -3,25 +3,37 @@ def test_extract_contig_window(): - assert list(extract_contig_window(contig_length=15, positions_of_interest={8}, window_size=1)) == [(7,9)] + assert extract_contig_window(contig_length=15, positions_of_interest={8}, window_size=1) == [(7,9)] # check that extracted window is inside contig limit - assert list(extract_contig_window(contig_length=16, positions_of_interest={15}, window_size=4)) == [(11,15)] + assert extract_contig_window(contig_length=16, positions_of_interest={15}, window_size=4) 
== [(11,15)] - assert list(extract_contig_window(contig_length=10, positions_of_interest={2, 8}, window_size=2)) == [(0,4), (6,9)] + assert extract_contig_window(contig_length=10, positions_of_interest={2, 8}, window_size=2) == [(0,4), (6,9)] + + # 12 window is (9,15) + # 19 window is (16,22) + # so when 12 and 19 are of interest window merge (9,22) + assert extract_contig_window(contig_length=200, positions_of_interest={12}, window_size=3) == [(9,15)] + assert extract_contig_window(contig_length=200, positions_of_interest={19}, window_size=3) == [(16,22)] + assert extract_contig_window(contig_length=200, positions_of_interest={12, 19}, window_size=3) == [(9,22)] - assert list(extract_contig_window(contig_length=10, positions_of_interest={2, 5, 8}, window_size=2)) == [(0,9)] + assert extract_contig_window(contig_length=10, positions_of_interest={2, 5, 8}, window_size=2) == [(0,9)] +def test_extract_contig_window_with_circular_contig(): # # check that circularity is properly taken into account - # assert list(extract_contig_window(contig_length=11, positions_of_interest={0}, window_size=2, is_circular=True)) == [(9,2)] + assert extract_contig_window(contig_length=12, positions_of_interest={1}, window_size=2, is_circular=True) == [(0,3), (11,11)] + assert extract_contig_window(contig_length=12, positions_of_interest={1}, window_size=3, is_circular=True) == [(0,4), (10,11)] + assert extract_contig_window(contig_length=12, positions_of_interest={10}, window_size=3, is_circular=True) == [(0,1), (7,11)] - # assert list(extract_contig_window(contig_length=11, positions_of_interest={0, 9}, window_size=2, is_circular=True)) == [(7,2)] - - # assert list(extract_contig_window(contig_length=11, positions_of_interest={0, 9}, window_size=2, is_circular=False)) == [(0,2), (7,11)] + assert list(extract_contig_window(contig_length=12, positions_of_interest={6}, window_size=6, is_circular=True)) == [(0,11)] + assert list(extract_contig_window(contig_length=12, 
positions_of_interest={1}, window_size=6, is_circular=True)) == [(0,11)] + assert list(extract_contig_window(contig_length=12, positions_of_interest={1}, window_size=6, is_circular=False)) == [(0,7)] + assert list(extract_contig_window(contig_length=12, positions_of_interest={0, 9}, window_size=2, is_circular=False)) == [(0,2), (7,11)] +def test_extract_contig_window_out_of_range(): with pytest.raises(IndexError): - list(extract_contig_window(contig_length=15, positions_of_interest={15}, window_size=1)) + extract_contig_window(contig_length=15, positions_of_interest={15}, window_size=1) with pytest.raises(IndexError): - list(extract_contig_window(contig_length=15, positions_of_interest={-1}, window_size=1)) \ No newline at end of file + extract_contig_window(contig_length=15, positions_of_interest={-1}, window_size=1) \ No newline at end of file From 2c2e7818402c85e30052da851cce275217825b50 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Thu, 4 May 2023 17:39:46 +0200 Subject: [PATCH 004/173] fix wrong limit computation in extract_contig_window --- ppanggolin/context/searchGeneContext.py | 29 ++++++++++--------- tests/context/test_context.py | 38 +++++++++++-------------- 2 files changed, 33 insertions(+), 34 deletions(-) diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index 762c6137..6755bb35 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -8,7 +8,7 @@ import logging import os from typing import List, Dict, Tuple, Iterable -from itertools import zip_longest +from itertools import zip_longest, chain # installed libraries from tqdm import tqdm @@ -52,8 +52,10 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: "associated to it. 
For now this works only if the clustering has been made by PPanGGOLiN.") check_pangenome_info(pangenome, need_annotations=True, need_families=True, disable_bar=disable_bar) + gene_families = {} fam_2_seq = None + if sequences is not None: # Alignment of sequences on pangenome families new_tmpdir = tempfile.TemporaryDirectory(dir=tmpdir) @@ -106,11 +108,12 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: # nx.write_graphml_lxml(gene_context_graph, os.path.join(output, "context.graphml")) -def extract_contig_window(contig_length: int, positions_of_interest: Iterable[int], window_size: int, is_circular:bool = False): + +def extract_contig_window(contig_size: int, positions_of_interest: Iterable[int], window_size: int, is_circular:bool = False): """ Extracts contiguous windows around positions of interest within a contig. - :param contig_length: The length of the contig. + :param contig_size: Number of genes in contig. :param positions_of_interest: An iterable containing the positions of interest. :param window_size: The size of the window to extract around each position of interest. :param is_circular: Indicates if the contig is circular. @@ -122,23 +125,23 @@ def extract_contig_window(contig_length: int, positions_of_interest: Iterable[in sorted_positions = sorted(positions_of_interest) # Check if any position of interest is out of range - if sorted_positions[0] <0 or sorted_positions[-1] >= contig_length: + if sorted_positions[0] <0 or sorted_positions[-1] >= contig_size: raise IndexError(f'Positions of interest are out of range. 
' - f"Contig length is {contig_length} while given min={sorted_positions[0]} & max={sorted_positions[-1]} positions") - - first_position = sorted_positions[0] - last_position = sorted_positions[-1] + f"Contig has {contig_size} genes while given min={sorted_positions[0]} & max={sorted_positions[-1]} positions") + if is_circular: + first_position = sorted_positions[0] + last_position = sorted_positions[-1] # in a circular contig, if the window of a gene of interest overlaps the end/start of the contig # an out of scope position is added to the sorted positions to take into account those positions # the returned window are always checked that its positions are not out of range... # so there's no chance to find an out of scope position in final list if first_position - window_size < 0: - out_of_scope_position = (contig_length ) + first_position + out_of_scope_position = (contig_size ) + first_position sorted_positions.append(out_of_scope_position) - if last_position + window_size >= contig_length : - out_of_scope_position = contig_length-1 - (last_position + window_size) + if last_position + window_size >= contig_size : + out_of_scope_position = last_position - contig_size sorted_positions.insert(0, out_of_scope_position) start_po = max(sorted_positions[0] - window_size, 0) @@ -147,13 +150,13 @@ def extract_contig_window(contig_length: int, positions_of_interest: Iterable[in if next_po is None: # If there are no more positions, add the final window - end_po = min(position + window_size, contig_length-1) + end_po = min(position + window_size, contig_size-1) windows_coordinates.append((start_po, end_po)) elif position + window_size +1 < next_po - window_size: # If there is a gap between positions, add the current window # and update the start position for the next window - end_po = min(position + window_size, contig_length-1) + end_po = min(position + window_size, contig_size-1) windows_coordinates.append((start_po, end_po)) diff --git a/tests/context/test_context.py 
b/tests/context/test_context.py index 8597444d..7e5fff07 100644 --- a/tests/context/test_context.py +++ b/tests/context/test_context.py @@ -1,39 +1,35 @@ import pytest -from ppanggolin.context.searchGeneContext import extract_contig_window +from ppanggolin.context.searchGeneContext import extract_contig_window, get_n_next_genes_index + def test_extract_contig_window(): - assert extract_contig_window(contig_length=15, positions_of_interest={8}, window_size=1) == [(7,9)] + assert extract_contig_window(contig_size=15, positions_of_interest={8}, window_size=1) == [(7,9)] # check that extracted window is inside contig limit - assert extract_contig_window(contig_length=16, positions_of_interest={15}, window_size=4) == [(11,15)] + assert extract_contig_window(contig_size=16, positions_of_interest={15}, window_size=4) == [(11,15)] - assert extract_contig_window(contig_length=10, positions_of_interest={2, 8}, window_size=2) == [(0,4), (6,9)] + assert extract_contig_window(contig_size=10, positions_of_interest={2, 8}, window_size=2) == [(0,4), (6,9)] # 12 window is (9,15) # 19 window is (16,22) # so when 12 and 19 are of interest window merge (9,22) - assert extract_contig_window(contig_length=200, positions_of_interest={12}, window_size=3) == [(9,15)] - assert extract_contig_window(contig_length=200, positions_of_interest={19}, window_size=3) == [(16,22)] - assert extract_contig_window(contig_length=200, positions_of_interest={12, 19}, window_size=3) == [(9,22)] + assert extract_contig_window(contig_size=200, positions_of_interest={12}, window_size=3) == [(9,15)] + assert extract_contig_window(contig_size=200, positions_of_interest={19}, window_size=3) == [(16,22)] + assert extract_contig_window(contig_size=200, positions_of_interest={12, 19}, window_size=3) == [(9,22)] - assert extract_contig_window(contig_length=10, positions_of_interest={2, 5, 8}, window_size=2) == [(0,9)] + assert extract_contig_window(contig_size=10, positions_of_interest={2, 5, 8}, window_size=2) == 
[(0,9)] def test_extract_contig_window_with_circular_contig(): # # check that circularity is properly taken into account - assert extract_contig_window(contig_length=12, positions_of_interest={1}, window_size=2, is_circular=True) == [(0,3), (11,11)] - assert extract_contig_window(contig_length=12, positions_of_interest={1}, window_size=3, is_circular=True) == [(0,4), (10,11)] - assert extract_contig_window(contig_length=12, positions_of_interest={10}, window_size=3, is_circular=True) == [(0,1), (7,11)] - - assert list(extract_contig_window(contig_length=12, positions_of_interest={6}, window_size=6, is_circular=True)) == [(0,11)] - assert list(extract_contig_window(contig_length=12, positions_of_interest={1}, window_size=6, is_circular=True)) == [(0,11)] - assert list(extract_contig_window(contig_length=12, positions_of_interest={1}, window_size=6, is_circular=False)) == [(0,7)] + assert extract_contig_window(contig_size=12, positions_of_interest={1}, window_size=2, is_circular=True) == [(0,3), (11,11)] + assert extract_contig_window(contig_size=12, positions_of_interest={1}, window_size=3, is_circular=True) == [(0,4), (10,11)] + assert extract_contig_window(contig_size=12, positions_of_interest={10}, window_size=3, is_circular=True) == [(0,1), (7,11)] - assert list(extract_contig_window(contig_length=12, positions_of_interest={0, 9}, window_size=2, is_circular=False)) == [(0,2), (7,11)] + assert extract_contig_window(contig_size=12, positions_of_interest={6}, window_size=6, is_circular=True) == [(0,11)] + assert extract_contig_window(contig_size=12, positions_of_interest={1}, window_size=6, is_circular=True) == [(0,11)] + assert extract_contig_window(contig_size=12, positions_of_interest={1}, window_size=6, is_circular=False) == [(0,7)] -def test_extract_contig_window_out_of_range(): - with pytest.raises(IndexError): - extract_contig_window(contig_length=15, positions_of_interest={15}, window_size=1) + assert extract_contig_window(contig_size=12, 
positions_of_interest={0, 9}, window_size=2, is_circular=False) == [(0,2), (7,11)] - with pytest.raises(IndexError): - extract_contig_window(contig_length=15, positions_of_interest={-1}, window_size=1) \ No newline at end of file + assert extract_contig_window(contig_size=894, positions_of_interest=[151, 152, 153, 893], window_size=4, is_circular=True) == [(0, 3), (147, 157), (889, 893)] From f3a55f2fedce13185127b7d4946a4a0f7e452d37 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Thu, 4 May 2023 17:43:11 +0200 Subject: [PATCH 005/173] add fct and test to get next gene index in a circular aware fashion --- ppanggolin/context/searchGeneContext.py | 15 ++++++++++++++- tests/context/test_context.py | 22 ++++++++++++++++++++++ 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index 6755bb35..d1cbd041 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -107,7 +107,20 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: # nx.write_graphml_lxml(gene_context_graph, os.path.join(output, "context.graphml")) - +def get_n_next_genes_index(current_index:int, next_genes_count:int, contig_size:int, is_circular:bool = False): + # Check if any position of interest is out of range + if current_index >= contig_size: + raise IndexError(f'current gene index is out of range. 
' + f"Contig has {contig_size} genes while the given gene index is {current_index}") + if is_circular: + next_genes = chain(range(current_index+1, contig_size), range(0, current_index)) + else: + next_genes = range(current_index+1, contig_size) + + for i, next_gene_index in enumerate(next_genes): + if i == next_genes_count: + break + yield next_gene_index def extract_contig_window(contig_size: int, positions_of_interest: Iterable[int], window_size: int, is_circular:bool = False): """ diff --git a/tests/context/test_context.py b/tests/context/test_context.py index 7e5fff07..cf6c5fb1 100644 --- a/tests/context/test_context.py +++ b/tests/context/test_context.py @@ -33,3 +33,25 @@ def test_extract_contig_window_with_circular_contig(): assert extract_contig_window(contig_size=12, positions_of_interest={0, 9}, window_size=2, is_circular=False) == [(0,2), (7,11)] assert extract_contig_window(contig_size=894, positions_of_interest=[151, 152, 153, 893], window_size=4, is_circular=True) == [(0, 3), (147, 157), (889, 893)] + +def test_extract_contig_window_out_of_range(): + with pytest.raises(IndexError): + extract_contig_window(contig_size=15, positions_of_interest={15}, window_size=1) + + with pytest.raises(IndexError): + extract_contig_window(contig_size=15, positions_of_interest={-1}, window_size=1) + +def test_get_n_next_genes_index(): + + assert list(get_n_next_genes_index(current_index=6, next_genes_count=3, contig_size=100, is_circular=False)) == [7, 8, 9] + + # there is no next gene because the current index is at the end of a non cicurclar contig + assert list(get_n_next_genes_index(current_index=11, next_genes_count=2, contig_size=12, is_circular=False)) == [] + +def test_get_n_next_genes_index_circular(): + assert list(get_n_next_genes_index(current_index=10, next_genes_count=3, contig_size=12, is_circular=True)) == [11, 0, 1] + assert list(get_n_next_genes_index(current_index=10, next_genes_count=16, contig_size=12, is_circular=True)) == [11, 0, 1, 2, 3, 4, 5, 
6, 7, 8, 9] + +def test_get_n_next_genes_index_out_of_range(): + with pytest.raises(IndexError): + assert list(get_n_next_genes_index(current_index=10, next_genes_count=16, contig_size=8, is_circular=False)) From cdcc4e1da5245cd225442a6aa99b7af1e182fcd5 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Thu, 4 May 2023 19:47:54 +0200 Subject: [PATCH 006/173] add fct add_edges_to_context_graph with simple test --- ppanggolin/context/searchGeneContext.py | 48 +++++++++++++++++++++++++ tests/context/test_context.py | 47 +++++++++++++++++++++++- 2 files changed, 94 insertions(+), 1 deletion(-) diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index d1cbd041..b645e993 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -106,7 +106,55 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: # for e, data in gene_context_graph(data=True): # nx.write_graphml_lxml(gene_context_graph, os.path.join(output, "context.graphml")) + +def add_edges_to_context_graph(context_graph: nx.Graph, + contig_genes: Iterable[Gene], + contig_windows: List[Tuple[int, int]], + t: int, + is_circular: bool, + disable_bar: bool = False): + print('WINDOWS to search contig in ', contig_windows) + for window_start, window_end in contig_windows: + print('IN WINDOW ', window_start, window_end ) + for gene_index in range(window_start, window_end +1): + print("- CURRENT GENE INDEX", gene_index) + gene = contig_genes[gene_index] + next_genes = get_n_next_genes_index(gene_index, next_genes_count=t, + contig_size=len(contig_genes), is_circular=is_circular) + next_genes = list(next_genes) + print('- next gene index:', next_genes) + + for next_gene_index in next_genes: + print('-- current next_gene_index', next_gene_index) + # check that next gene is in contig windows.. 
+ if not any(lower <= next_gene_index <= upper for (lower, upper) in contig_windows): + # next_gene_index is not in any range of genes in the context + # so it is ignore and all folowing genes as well + break + + next_gene = contig_genes[next_gene_index] + if next_gene.family == gene.family: + # if next gene has the same family, the two gene refer to the same node + # so they are ignored.. + print('gene and next gene families are identical.. continue ') + continue + + context_graph.add_edge(gene.family, next_gene.family) + print(f"-- ADD edge between families", gene.family.name, next_gene.family.name) + + try: + context_graph[gene.family][next_gene.family][gene.family].add(gene) + except KeyError: + context_graph[gene.family][next_gene.family][gene.family] = {gene} + try: + context_graph[gene.family][next_gene.family][next_gene.family].add(next_gene) + except KeyError: + context_graph[gene.family][next_gene.family][next_gene.family] = {next_gene} + + + + def get_n_next_genes_index(current_index:int, next_genes_count:int, contig_size:int, is_circular:bool = False): # Check if any position of interest is out of range if current_index >= contig_size: diff --git a/tests/context/test_context.py b/tests/context/test_context.py index cf6c5fb1..fdd435af 100644 --- a/tests/context/test_context.py +++ b/tests/context/test_context.py @@ -1,6 +1,10 @@ import pytest -from ppanggolin.context.searchGeneContext import extract_contig_window, get_n_next_genes_index +from ppanggolin.context.searchGeneContext import extract_contig_window, get_n_next_genes_index, add_edges_to_context_graph +from ppanggolin.geneFamily import GeneFamily +from ppanggolin.genome import Gene, Contig + +import networkx as nx def test_extract_contig_window(): @@ -55,3 +59,44 @@ def test_get_n_next_genes_index_circular(): def test_get_n_next_genes_index_out_of_range(): with pytest.raises(IndexError): assert list(get_n_next_genes_index(current_index=10, next_genes_count=16, contig_size=8, is_circular=False)) + 
+@pytest.fixture() +def simple_contig(): + + contig = Contig(name="contig1", is_circular=False) + + contig_size=6 + genes = [Gene(i) for i in range(contig_size)] + + for i, (gene, family_name) in enumerate(zip(genes, 'ABCDEFGHIJKLMNOP')): + family = GeneFamily(i, family_name) + gene.fill_annotations(start=0, stop=0, strand=0, position=i) + + contig.add_gene(gene) + family.add_gene(gene) + + return contig + + + +def test_add_edges_to_context_graph(simple_contig): + context_graph = nx.Graph() + + #simple_contig families : ABCDEF + + add_edges_to_context_graph(context_graph, + contig_genes = simple_contig.genes, + contig_windows = [(0,3)], + t=2, + is_circular=simple_contig.is_circular) + + nodes = sorted([n.name for n in context_graph.nodes()]) + edges = {tuple(sorted([n.name, v.name])) for n, v in context_graph.edges()} + + assert nodes == ['A', "B", "C", "D"] + assert edges == {('A', 'B'), + ('A', 'C'), + ('B', 'C'), + ('B', 'D'), + ('C', 'D')} + \ No newline at end of file From d6131ca1cb98df927675907aa4af19adc9da0d19 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Fri, 5 May 2023 10:57:43 +0200 Subject: [PATCH 007/173] add test and clean fct add_edges_to_context_graph --- ppanggolin/context/searchGeneContext.py | 89 ++++++++++++++++--------- tests/context/test_context.py | 83 +++++++++++++++++++++++ 2 files changed, 141 insertions(+), 31 deletions(-) diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index b645e993..a98b1062 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -7,7 +7,7 @@ import time import logging import os -from typing import List, Dict, Tuple, Iterable +from typing import List, Dict, Tuple, Iterable, Hashable from itertools import zip_longest, chain # installed libraries @@ -102,57 +102,84 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: logging.getLogger().info(f"Computing gene contexts took {round(time.time() - 
start_time, 2)} seconds") - - # for e, data in gene_context_graph(data=True): - - # nx.write_graphml_lxml(gene_context_graph, os.path.join(output, "context.graphml")) - def add_edges_to_context_graph(context_graph: nx.Graph, contig_genes: Iterable[Gene], contig_windows: List[Tuple[int, int]], t: int, - is_circular: bool, - disable_bar: bool = False): - - print('WINDOWS to search contig in ', contig_windows) + is_circular: bool): + """ + Add edges to the context graph based on contig genes and windows. + + :param context_graph: The context graph to which edges will be added. + :param contig_genes: An iterable of genes in the contig. + :param contig_windows: A list of tuples representing the start and end positions of contig windows. + :param t: The number of next genes to consider when adding edges. + :param is_circular: A boolean indicating if the contig is circular. + + """ for window_start, window_end in contig_windows: - print('IN WINDOW ', window_start, window_end ) - for gene_index in range(window_start, window_end +1): - print("- CURRENT GENE INDEX", gene_index) + for gene_index in range(window_start, window_end + 1): gene = contig_genes[gene_index] next_genes = get_n_next_genes_index(gene_index, next_genes_count=t, - contig_size=len(contig_genes), is_circular=is_circular) + contig_size=len(contig_genes), is_circular=is_circular) next_genes = list(next_genes) - print('- next gene index:', next_genes) - + for next_gene_index in next_genes: - print('-- current next_gene_index', next_gene_index) - # check that next gene is in contig windows.. 
+ # Check if the next gene is within the contig windows if not any(lower <= next_gene_index <= upper for (lower, upper) in contig_windows): # next_gene_index is not in any range of genes in the context - # so it is ignore and all folowing genes as well + # so it is ignored along with all following genes break next_gene = contig_genes[next_gene_index] if next_gene.family == gene.family: - # if next gene has the same family, the two gene refer to the same node - # so they are ignored.. - print('gene and next gene families are identical.. continue ') + # If the next gene has the same family, the two genes refer to the same node + # so they are ignored continue context_graph.add_edge(gene.family, next_gene.family) - print(f"-- ADD edge between families", gene.family.name, next_gene.family.name) + + # Add edge attributes + edge_dict = context_graph[gene.family][next_gene.family] + add_val_to_edge_attribute(edge_dict, gene.family, gene) + add_val_to_edge_attribute(edge_dict, next_gene.family, next_gene) + + add_val_to_edge_attribute(edge_dict, "organisms", gene.organism) + + update_edge_attribute_counter(edge_dict, "gene_pairs") - try: - context_graph[gene.family][next_gene.family][gene.family].add(gene) - except KeyError: - context_graph[gene.family][next_gene.family][gene.family] = {gene} - try: - context_graph[gene.family][next_gene.family][next_gene.family].add(next_gene) - except KeyError: - context_graph[gene.family][next_gene.family][next_gene.family] = {next_gene} + assert gene.organism == next_gene.organism +def add_val_to_edge_attribute(edge_dict: dict, attribute_key, attribute_value): + """ + Add an edge attribute value to the edge dictionary set. + + :param edge_dict: The dictionary containing the edge attributes. + :param attribute_key: The key of the attribute. + :param attribute_value: The value of the attribute to be added. 
+ + """ + + try: + edge_dict[attribute_key].add(attribute_value) + except KeyError: + edge_dict[attribute_key] = {attribute_value} + + +def update_edge_attribute_counter(edge_dict: dict, key:Hashable): + """ + Update the counter for an edge attribute in the edge dictionary. + + :param edge_dict: The dictionary containing the edge attributes. + :param key: The key of the attribute. + + """ + + try: + edge_dict[key] += 1 + except KeyError: + edge_dict[key] = 1 def get_n_next_genes_index(current_index:int, next_genes_count:int, contig_size:int, is_circular:bool = False): diff --git a/tests/context/test_context.py b/tests/context/test_context.py index fdd435af..264cbf8f 100644 --- a/tests/context/test_context.py +++ b/tests/context/test_context.py @@ -77,6 +77,23 @@ def simple_contig(): return contig +@pytest.fixture() +def simple_circular_contig(): + + contig = Contig(name="contig2", is_circular=True) + + contig_size=6 + genes = [Gene(i) for i in range(contig_size)] + + for i, (gene, family_name) in enumerate(zip(genes, 'ABCDEFGHIJKLMNOP')): + family = GeneFamily(i, family_name) + gene.fill_annotations(start=0, stop=0, strand=0, position=i) + + contig.add_gene(gene) + family.add_gene(gene) + + return contig + def test_add_edges_to_context_graph(simple_contig): @@ -99,4 +116,70 @@ def test_add_edges_to_context_graph(simple_contig): ('B', 'C'), ('B', 'D'), ('C', 'D')} + +def test_add_edges_to_context_graph_2(simple_contig): + context_graph = nx.Graph() + + #simple_contig families : A B-C-D E F + + add_edges_to_context_graph(context_graph, + contig_genes = simple_contig.genes, + contig_windows = [(1,3)], + t=1, + is_circular=simple_contig.is_circular) + + nodes = sorted([n.name for n in context_graph.nodes()]) + edges = {tuple(sorted([n.name, v.name])) for n, v in context_graph.edges()} + + assert nodes == ["B", "C", "D"] + assert edges == {('B', 'C'), + ('C', 'D')} + +def test_add_edges_to_context_graph_linear(simple_contig): + + # genes : 1-2-3-4-5-6 + # families : 
A-B-C-D-E-F + # windows : _____ ___ [(0,2) (4,5)] + + + context_graph = nx.Graph() + + add_edges_to_context_graph(context_graph, + contig_genes = simple_contig.genes, + contig_windows = [(4,5), (0,2)], + t=1, + is_circular=False) + + nodes = sorted([n.name for n in context_graph.nodes()]) + edges = {tuple(sorted([n.name, v.name])) for n, v in context_graph.edges()} + + assert nodes == ["A", "B", "C", "E", "F"] + assert edges == {('A', 'B'), + ('B', 'C'), + ('E', "F"), + } + + +def test_add_edges_to_context_graph_circular(simple_contig): + + # genes : 1-2-3-4-5-6 + # families : A-B-C-D-E-F + # windows : _____ ___ [(0,2) (4,5)] + + context_graph = nx.Graph() + + add_edges_to_context_graph(context_graph, + contig_genes = simple_contig.genes, + contig_windows = [(4,5), (0,2)], + t=1, + is_circular=True) + + nodes = sorted([n.name for n in context_graph.nodes()]) + edges = {tuple(sorted([n.name, v.name])) for n, v in context_graph.edges()} + + assert nodes == ["A", "B", "C", "E", "F"] + assert edges == {('A', 'B'), + ('B', 'C'), + ('E', "F"), + ('A', 'F')} # circular so F and A are linked \ No newline at end of file From a0732f57910a6306ebfd9e757fdf1c1ee253d70f Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Fri, 5 May 2023 11:05:02 +0200 Subject: [PATCH 008/173] add docstring and type to function compute_gene_context_graph --- ppanggolin/context/searchGeneContext.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index a98b1062..ac385963 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -7,7 +7,7 @@ import time import logging import os -from typing import List, Dict, Tuple, Iterable, Hashable +from typing import List, Dict, Tuple, Iterable, Hashable, Iterator from itertools import zip_longest, chain # installed libraries @@ -182,8 +182,22 @@ def update_edge_attribute_counter(edge_dict: dict, 
key:Hashable): edge_dict[key] = 1 -def get_n_next_genes_index(current_index:int, next_genes_count:int, contig_size:int, is_circular:bool = False): - # Check if any position of interest is out of range +def get_n_next_genes_index(current_index: int, next_genes_count: int, contig_size: int, is_circular: bool = False) -> Iterator[int]: + """ + Generate the indices of the next genes based on the current index and contig properties. + + :param current_index: The index of the current gene. + :param next_genes_count: The number of next genes to consider. + :param contig_size: The total number of genes in the contig. + :param is_circular: Flag indicating whether the contig is circular (default: False). + :return: An iterator yielding the indices of the next genes. + + Raises: + - IndexError: If the current index is out of range for the given contig size. + + """ + + # Check if the current index is out of range if current_index >= contig_size: raise IndexError(f'current gene index is out of range. 
' f"Contig has {contig_size} genes while the given gene index is {current_index}") From 4725e9fcf06294e8e86880362ac8bc808f09be32 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Fri, 5 May 2023 11:51:09 +0200 Subject: [PATCH 009/173] add function compute_gene_context_graph and test --- ppanggolin/context/searchGeneContext.py | 123 +++++++++--------------- tests/context/test_context.py | 33 ++++++- 2 files changed, 77 insertions(+), 79 deletions(-) diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index ac385963..f2f4b0d7 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -7,8 +7,9 @@ import time import logging import os -from typing import List, Dict, Tuple, Iterable, Hashable, Iterator +from typing import List, Dict, Tuple, Iterable, Hashable, Iterator, Set from itertools import zip_longest, chain +from collections import defaultdict # installed libraries from tqdm import tqdm @@ -18,10 +19,11 @@ # local libraries from ppanggolin.formats import check_pangenome_info from ppanggolin.genome import Gene, Contig -from ppanggolin.utils import mk_outdir, restricted_float, add_gene, connected_components +from ppanggolin.utils import mk_outdir, restricted_float, connected_components from ppanggolin.pangenome import Pangenome from ppanggolin.align.alignOnPang import get_seq2pang, project_partition from ppanggolin.region import GeneContext +from ppanggolin.geneFamily import GeneFamily def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: str, sequences: str = None, @@ -80,10 +82,14 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: # Compute the graph with transitive closure size provided as parameter start_time = time.time() + logging.getLogger().info("Building the graph...") + gene_context_graph = compute_gene_context_graph(families=gene_families, t=transitive, half_window=half_window, disable_bar=disable_bar) + 
logging.getLogger().info( f"Took {round(time.time() - start_time, 2)} seconds to build the graph to find common gene contexts") + logging.getLogger().debug(f"There are {nx.number_of_nodes(gene_context_graph)} nodes and {nx.number_of_edges(gene_context_graph)} edges") @@ -266,91 +272,55 @@ def extract_contig_window(contig_size: int, positions_of_interest: Iterable[int] return windows_coordinates - -def compute_gene_context_graph(families: dict, t: int = 4, half_window: int = 0, disable_bar: bool = False) -> nx.Graph: +def get_contig_to_genes(gene_families: Iterable[GeneFamily]) -> Dict[Contig, Set[Gene]]: """ - Construct the graph of gene contexts between families of the pan - - :param families: Gene families of interest - :param t: transitive value - :param half_window: An integer specifying the number of genes to include in the context on each side of the gene of interest. - :param disable_bar: Prevents progress bar printing + Group genes from specified gene families by contig. - :return: Graph of gene contexts between interesting gene families of the pan - """ - - g = nx.Graph() - for family in tqdm(families.values(), unit="families", disable=disable_bar): - for gene in family.genes: - contig = gene.contig.genes - pos_left, in_context_left, pos_right, in_context_right = extract_gene_context(gene, contig, families, t, half_window) - if in_context_left or in_context_right: - for env_gene in contig[pos_left:pos_right + 1]: - _compute_gene_context_graph(g, env_gene, contig, pos_right) - return g - - -def _compute_gene_context_graph(g: nx.Graph, env_gene: Gene, contig: Contig, pos_r: int): - """ - Compute graph of gene contexts between one gene and the other part of the contig - - :param: Graph of gene contexts between interesting gene families of the pan - :param env_gene: Gene of the current position - :param contig: Current contig to search a gene context - :param pos_r: Gene to search a gene context + :param gene_families: An iterable of gene families object. 
+ + :return: A dictionary mapping contigs to sets of genes. """ - - g.add_node(env_gene.family) - add_gene(g.nodes[env_gene.family], env_gene, fam_split=False) - pos = env_gene.position + 1 - while pos <= pos_r: - if env_gene.family != contig[pos].family: - g.add_edge(env_gene.family, contig[pos].family) - edge = g[env_gene.family][contig[pos].family] - add_gene(edge, env_gene) - add_gene(edge, contig[pos]) - pos += 1 - + + contig_to_genes_of_interest = defaultdict(set) + for gene_family in gene_families: + for gene in gene_family.genes: + contig = gene.contig + contig_to_genes_of_interest[contig].add(gene) + return contig_to_genes_of_interest -def extract_gene_context(gene: Gene, contig: List[Gene], families: Dict[str, str], t: int = 4, half_window: int = 0) -> Tuple[int, bool, int, bool]: +def compute_gene_context_graph(families: Iterable[GeneFamily], transitive: int = 4, window_size: int = 0, disable_bar: bool = False) -> nx.Graph: """ - Determine the left and rigth position of the gene context and whether said gene context exists. + Construct the graph of gene contexts between families of the pangenome. - :param gene: Gene of interest - :param contig: list of genes in contig - :param families: Alignment results - :param t: transitive value - :param half_window: An integer specifying the number of genes to include in the context on each side of the gene of interest. + :param families: An iterable of gene families. + :param transitive: Size of the transitive closure used to build the graph. + :param window_size: Size of the window for extracting gene contexts (default: 0). + :param disable_bar: Flag to disable the progress bar (default: False). + :return: The constructed gene context graph. 
- :return: Position of the context and if it exists for each side ('left' and 'right') """ - search_window = max(t, half_window) - - pos_left, pos_right = (max(0, gene.position - search_window), - min(gene.position + search_window, len(contig) - 1)) # Gene positions to compare family + context_graph = nx.Graph() - in_context_left, in_context_right = (False, False) - while pos_left < gene.position and not in_context_left: - if gene.position - pos_left <= half_window: - # position is in the window - in_context_left = True - - elif contig[pos_left].family in families.values(): - in_context_left = True - else: - pos_left += 1 - - while pos_right > gene.position and not in_context_right: - if pos_right - gene.position <= half_window: - in_context_right = True - elif contig[pos_right].family in families.values(): - in_context_right = True - else: - pos_right -= 1 + contig_to_genes_of_interest = get_contig_to_genes(families) + + for contig, genes_of_interest in tqdm(contig_to_genes_of_interest.items(), unit="contig", total=len(contig_to_genes_of_interest), disable=disable_bar): + logging.debug(f'Processing {len(genes_of_interest)} genes of interest in contig {contig}') + + genes_count = len(contig.genes) + + genes_of_interest_positions = [g.position for g in genes_of_interest] - return pos_left, in_context_left, pos_right, in_context_right + contig_windows = extract_contig_window(genes_count, genes_of_interest_positions, + window_size=window_size, is_circular=contig.is_circular) + + add_edges_to_context_graph(context_graph, + contig.genes, + contig_windows, + transitive, + contig.is_circular) + return context_graph def compute_gene_context(g: nx.Graph, jaccard: float = 0.85) -> set: @@ -391,7 +361,8 @@ def fam2seq(seq_to_pan: dict) -> dict: def export_to_dataframe(families: set, gene_contexts: set, fam_to_seq: dict, output: str): - """ Export the results into dataFrame + """ + Export the results into dataFrame :param families: Families related to the connected components 
:param gene_contexts: connected components found in the pan diff --git a/tests/context/test_context.py b/tests/context/test_context.py index 264cbf8f..1fd45d62 100644 --- a/tests/context/test_context.py +++ b/tests/context/test_context.py @@ -1,5 +1,5 @@ import pytest -from ppanggolin.context.searchGeneContext import extract_contig_window, get_n_next_genes_index, add_edges_to_context_graph +from ppanggolin.context.searchGeneContext import extract_contig_window, get_n_next_genes_index, add_edges_to_context_graph, compute_gene_context_graph from ppanggolin.geneFamily import GeneFamily from ppanggolin.genome import Gene, Contig @@ -71,7 +71,9 @@ def simple_contig(): for i, (gene, family_name) in enumerate(zip(genes, 'ABCDEFGHIJKLMNOP')): family = GeneFamily(i, family_name) gene.fill_annotations(start=0, stop=0, strand=0, position=i) - + + gene.fill_parents("organism A", contig) + contig.add_gene(gene) family.add_gene(gene) @@ -182,4 +184,29 @@ def test_add_edges_to_context_graph_circular(simple_contig): ('B', 'C'), ('E', "F"), ('A', 'F')} # circular so F and A are linked - \ No newline at end of file + + +def test_compute_gene_context_graph(simple_contig): + + # genes : 0-1-2-3-4-5 + # families : A-B-C-D-E-F + # family of interest : ^ + # windows of 2 : ___ ___ + + # simple case with only one contig with 6 genes and 6 families + + families_in_contigs = [g.family for g in simple_contig.genes ] + family_names_of_interest = ["C"] + families_of_interest = {f for f in families_in_contigs if f.name in family_names_of_interest } + + context_graph = compute_gene_context_graph(families_of_interest, + transitive=1, + window_size = 2) + nodes = sorted([n.name for n in context_graph.nodes()]) + edges = {tuple(sorted([n.name, v.name])) for n, v in context_graph.edges()} + + assert nodes == ["A", "B", "C", "D", "E"] + assert edges == {('A', 'B'), + ('B', 'C'), + ('C', "D"), + ('D', 'E')} \ No newline at end of file From bb5ecbd8f022034f86253c7294f82902a09b3c37 Mon Sep 17 00:00:00 
2001 From: JeanMainguy Date: Tue, 9 May 2023 09:59:45 +0200 Subject: [PATCH 010/173] write graph to investigate found contexts --- ppanggolin/context/searchGeneContext.py | 98 +++++++++++++++++++------ 1 file changed, 77 insertions(+), 21 deletions(-) diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index f2f4b0d7..110f88d2 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -28,7 +28,7 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: str, sequences: str = None, families: str = None, transitive: int = 4, identity: float = 0.5, - coverage: float = 0.8, jaccard: float = 0.85, window_size: int = 1, no_defrag: bool = False, + coverage: float = 0.8, jaccard_threshold: float = 0.85, window_size: int = 1, no_defrag: bool = False, cpu: int = 1, disable_bar=True): """ Main function to search common gene contexts between sequence set and pangenome families @@ -41,7 +41,7 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: :param transitive: number of genes to check on both sides of a family aligned with an input sequence :param identity: minimum identity threshold between sequences and gene families for the alignment :param coverage: minimum coverage threshold between sequences and gene families for the alignment - :param jaccard: Jaccard index to filter edges in graph + :param jaccard_threshold: Jaccard index threshold to filter edges in graph :param window_size: Number of genes to consider in the gene context. 
:param no_defrag: do not use the defrag workflow if true :param cpu: Number of core used to process @@ -55,7 +55,7 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: check_pangenome_info(pangenome, need_annotations=True, need_families=True, disable_bar=disable_bar) - gene_families = {} + gene_families = set() fam_2_seq = None if sequences is not None: @@ -68,45 +68,100 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: new_tmpdir.cleanup() for pan_family in seq2pan.values(): - gene_families[pan_family.name] = pan_family + gene_families.add(pan_family) fam_2_seq = fam2seq(seq2pan) if families is not None: with open(families, 'r') as f: for fam_name in f.read().splitlines(): - gene_families[fam_name] = pangenome.get_gene_family(fam_name) + gene_families.add(pangenome.get_gene_family(fam_name)) - half_window = round((window_size-1)/2) - logging.info(f'Window size of {half_window*2 + 1}. Gene context will include {half_window} genes on each side of the target gene.') + # half_window = round((window_size-1)/2) + # logging.info(f'Window size of {half_window*2 + 1}. 
Gene context will include {half_window} genes on each side of the target gene.') # Compute the graph with transitive closure size provided as parameter start_time = time.time() logging.getLogger().info("Building the graph...") - gene_context_graph = compute_gene_context_graph(families=gene_families, t=transitive, half_window=half_window, disable_bar=disable_bar) + gene_context_graph = compute_gene_context_graph(families=gene_families, transitive=transitive, window_size=window_size, disable_bar=disable_bar) logging.getLogger().info( f"Took {round(time.time() - start_time, 2)} seconds to build the graph to find common gene contexts") logging.getLogger().debug(f"There are {nx.number_of_nodes(gene_context_graph)} nodes and {nx.number_of_edges(gene_context_graph)} edges") - + + compute_edge_metrics(gene_context_graph, jaccard_threshold) + + + write_graph(gene_context_graph, output, gene_families) # extract the modules from the graph - common_components = compute_gene_context(gene_context_graph, jaccard) + # common_components = compute_gene_context(gene_context_graph, jaccard) - families = set() - for gene_context in common_components: - families |= gene_context.families + # families = set() + # for gene_context in common_components: + # families |= gene_context.families - if len(families) != 0: - export_to_dataframe(families, common_components, fam_2_seq, output) - else: - logging.getLogger().info(f"No gene contexts were found") + # if len(families) != 0: + # export_to_dataframe(families, common_components, fam_2_seq, output) + # else: + # logging.getLogger().info(f"No gene contexts were found") logging.getLogger().info(f"Computing gene contexts took {round(time.time() - start_time, 2)} seconds") +def write_graph(context_graph:nx.Graph, output_dir:str, famillies_of_interest): + + def filter_edge_attribute(data): + return {k:v for k, v in data.items() if type(v) != set} + + G = nx.Graph() + + G.add_edges_from(((f1.name, f2.name) for f1,f2 in context_graph.edges())) + + 
edges_with_attributes = {(f1.name, f2.name):filter_edge_attribute(d) for f1,f2,d in context_graph.edges(data=True)} + + nx.set_edge_attributes(G, edges_with_attributes) + + nodes_data = {f.name:{"organisms":len(f.organisms), "genes":len(f.genes), "famillies_of_interest": f in famillies_of_interest} for f in context_graph.nodes()} + + for f, d in G.nodes(data=True): + d.update(nodes_data[f]) + + nx.write_graphml_lxml(G, os.path.join(output_dir, "graph_context.graphml")) + +def compute_edge_metrics(context_graph: nx.Graph, gene_proportion_cutoff:float ): + # compute jaccard on organism + for f1, f2, data in context_graph.edges(data=True): + + data['jaccard_organism'] = len(data['organisms'])/len(f1.organisms | f2.organisms) + + data['min_jaccard_organism'] = len(data['organisms'])/min(len(f1.organisms), len(f2.organisms)) + data['max_jaccard_organism'] = len(data['organisms'])/max(len(f1.organisms), len(f2.organisms)) + + + # print("f1", f1) + # print("len data[f1]", len(data[f1])) + + # print('len(f1.genes)', len(f1.genes)) + + f1_gene_proportion = len(data[f1])/len(f1.genes) + f2_gene_proportion = len(data[f2])/len(f2.genes) + + data[f'f1'] = f1.name + data[f'f2'] = f2.name + data[f'f1_gene_proportion'] = f1_gene_proportion + data[f'f2_gene_proportion'] = f2_gene_proportion + + data[f'is_gene_proportion_>_{gene_proportion_cutoff}'] = (f1_gene_proportion >= gene_proportion_cutoff) and (f2_gene_proportion >= gene_proportion_cutoff) + + # for k, v in data.items(): + # if type(v) != set: + # print(k, v) + # print('===') + + def add_edges_to_context_graph(context_graph: nx.Graph, contig_genes: Iterable[Gene], @@ -297,8 +352,8 @@ def compute_gene_context_graph(families: Iterable[GeneFamily], transitive: int = :param transitive: Size of the transitive closure used to build the graph. :param window_size: Size of the window for extracting gene contexts (default: 0). :param disable_bar: Flag to disable the progress bar (default: False). 
- :return: The constructed gene context graph. + :return: The constructed gene context graph. """ context_graph = nx.Graph() @@ -404,7 +459,7 @@ def launch(args: argparse.Namespace): pangenome.add_file(args.pangenome) search_gene_context_in_pangenome(pangenome=pangenome, output=args.output, tmpdir=args.tmpdir, sequences=args.sequences, families=args.family, transitive=args.transitive, - identity=args.identity, coverage=args.coverage, jaccard=args.jaccard, window_size=args.window_size, + identity=args.identity, coverage=args.coverage, jaccard_threshold=args.jaccard, window_size=args.window_size, no_defrag=args.no_defrag, cpu=args.cpu, disable_bar=args.disable_prog_bar) @@ -452,8 +507,9 @@ def parser_context(parser: argparse.ArgumentParser): help="Size of the transitive closure used to build the graph. This indicates the number of " "non related genes allowed in-between two related genes. Increasing it will improve " "precision but lower sensitivity a little.") - optional.add_argument("-w", "--window_size", required=False, type=int, default=1, - help="Number of neighboring genes that are considered when searching for conserved genomic contexts around a gene of interest.") + optional.add_argument("-w", "--window_size", required=False, type=int, default=5, + help="Number of neighboring genes that are considered on each side of " + "a gene of interest when searching for conserved genomic contexts.") optional.add_argument("-s", "--jaccard", required=False, type=restricted_float, default=0.85, help="minimum jaccard similarity used to filter edges between gene families. 
Increasing it " From a6bb109c71679a139dab5101499d1ee3e8da65e7 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Fri, 12 May 2023 17:27:00 +0200 Subject: [PATCH 011/173] filter and output contexts in tsv and in graph --- ppanggolin/context/searchGeneContext.py | 281 ++++++++++++++++-------- 1 file changed, 192 insertions(+), 89 deletions(-) diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index 110f88d2..35470750 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -29,7 +29,7 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: str, sequences: str = None, families: str = None, transitive: int = 4, identity: float = 0.5, coverage: float = 0.8, jaccard_threshold: float = 0.85, window_size: int = 1, no_defrag: bool = False, - cpu: int = 1, disable_bar=True): + cpu: int = 1, write_context_graph:bool = False, disable_bar=True): """ Main function to search common gene contexts between sequence set and pangenome families @@ -45,6 +45,7 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: :param window_size: Number of genes to consider in the gene context. 
:param no_defrag: do not use the defrag workflow if true :param cpu: Number of core used to process + :param write_context_graph: Write graph of the contexts :param disable_bar: Allow preventing bar progress print """ @@ -90,77 +91,159 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: logging.getLogger().info( f"Took {round(time.time() - start_time, 2)} seconds to build the graph to find common gene contexts") - logging.getLogger().debug(f"There are {nx.number_of_nodes(gene_context_graph)} nodes and {nx.number_of_edges(gene_context_graph)} edges") + logging.getLogger().debug(f"Context graph made of {nx.number_of_nodes(gene_context_graph)} nodes and {nx.number_of_edges(gene_context_graph)} edges") compute_edge_metrics(gene_context_graph, jaccard_threshold) + # Filter graph + filter_flag = f'is_jaccard_gene_>_{jaccard_threshold}' + + filtered_graph = nx.subgraph_view(gene_context_graph, filter_edge=lambda n1, n2: gene_context_graph[n1][n2][filter_flag] ) + + logging.getLogger().debug(f"Filtering context graph on {filter_flag}") + logging.getLogger().debug(f"Context graph made of {nx.number_of_nodes(filtered_graph)} nodes and {nx.number_of_edges(filtered_graph)} edges") + connected_components = nx.connected_components(filtered_graph) + + # Connected component graph Filtering + + # remove singleton famillies + connected_components = (component for component in connected_components if len(component) > 1) - write_graph(gene_context_graph, output, gene_families) + # remove component made only of famillies not initially requested + connected_components = (component for component in connected_components if component & gene_families) - # extract the modules from the graph - # common_components = compute_gene_context(gene_context_graph, jaccard) + gene_contexts = {GeneContext(gc_id=i, families=component) for i, component in enumerate(connected_components) } - # families = set() - # for gene_context in common_components: - # families |= 
gene_context.families + families_in_contexts = {family for gene_context in gene_contexts for family in gene_context.families} + + graph_with_final_contexts = nx.subgraph_view(gene_context_graph, filter_node=lambda n: n in families_in_contexts) + + if write_context_graph: + write_graph(graph_with_final_contexts, output, gene_families, gene_contexts) + + if len(families_in_contexts) != 0: + logging.getLogger().debug(f"There are {len(families_in_contexts)} families among {len(gene_contexts)} gene contexts") + + output_file = os.path.join(output, "gene_contexts.tsv") - # if len(families) != 0: - # export_to_dataframe(families, common_components, fam_2_seq, output) - # else: - # logging.getLogger().info(f"No gene contexts were found") + export_context_to_dataframe(gene_contexts, fam_2_seq, output_file) + + else: + logging.getLogger().info(f"No gene contexts were found") logging.getLogger().info(f"Computing gene contexts took {round(time.time() - start_time, 2)} seconds") -def write_graph(context_graph:nx.Graph, output_dir:str, famillies_of_interest): - def filter_edge_attribute(data): - return {k:v for k, v in data.items() if type(v) != set} - + # # Finding connected components with panmodule functions + # # extract the modules from the graph + + # logging.getLogger().debug(f"panmodule style:") + # gene_contexts_pandmodule_way = compute_gene_context(gene_context_graph, jaccard_threshold) + + # # remove singleton famillies + # gene_contexts_pandmodule_way = (context for context in gene_contexts_pandmodule_way if len(context.families) > 1 ) + + # # remove component made only of famillies not initially requested + # gene_contexts_pandmodule_way = [context for context in gene_contexts_pandmodule_way if context.families & gene_families] + + # families_in_contexts = {family for gene_context in gene_contexts_pandmodule_way for family in gene_context.families} + + # logging.getLogger().debug(f"There are {len(families_in_contexts)} families among 
{len(gene_contexts_pandmodule_way)} gene contexts") + + + # output_file = os.path.join(output, "gene_contexts_panmodule_style.tsv") + # export_context_to_dataframe(gene_contexts_pandmodule_way, fam_2_seq, output_file) + + + +def write_graph(context_graph: nx.Graph, output_dir: str, famillies_of_interest: Set[GeneFamily], gene_contexts:List[GeneContext]): + """ + Write a graph to file with node and edge attributes. + + This function writes a graph to a file in the GraphML format or in GEXF format. The original context graph contains + ppanggolin objects as nodes and lists and dictionaries in edge attributes. Since these objects + cannot be written to the output graph, this function creates a new graph that contains only + writable objects. + + :param context_graph: A NetworkX Graph object representing the graph. + :param output_dir: The output directory where the graph file will be written. + :param famillies_of_interest: A list of node objects that are of interest. + :param gene_contexts: List of gene context, used to add context id to node of the graph + + """ + def filter_attribute(data:dict): + """ + Helper function to filter the edge attributes. + + :param data: The edge attribute data. + :return: A filtered dictionary containing only non-collection attributes. 
+ """ + return {k:v for k, v in data.items() if type(v) not in [set, dict, list]} + G = nx.Graph() G.add_edges_from(((f1.name, f2.name) for f1,f2 in context_graph.edges())) - edges_with_attributes = {(f1.name, f2.name):filter_edge_attribute(d) for f1,f2,d in context_graph.edges(data=True)} + edges_with_attributes = {(f1.name, f2.name):filter_attribute(d) for f1,f2,d in context_graph.edges(data=True)} nx.set_edge_attributes(G, edges_with_attributes) - - nodes_data = {f.name:{"organisms":len(f.organisms), "genes":len(f.genes), "famillies_of_interest": f in famillies_of_interest} for f in context_graph.nodes()} + nodes_attributes_filtered = {f.name:filter_attribute(d) for f,d in context_graph.nodes(data=True)} + + # on top of attributes already contained in node of context graph + # add organisms and genes count that have the family, the partition and if the family was in initially requested + nodes_family_data = {f.name:{"organisms":len(f.organisms), + "partition":f.partition, + "genes":len(f.genes), + "famillies_of_interest": f in famillies_of_interest} for f in context_graph.nodes()} + + family_name_to_context_id = {family.name:context.ID for context in gene_contexts for family in context.families} + for f, d in G.nodes(data=True): - d.update(nodes_data[f]) + d.update(nodes_family_data[f]) + d.update(nodes_attributes_filtered[f]) + d['context_id'] = family_name_to_context_id[f] + + + graphml_file = os.path.join(output_dir, "graph_context.graphml") + logging.info(f'Writting context graph in {graphml_file}') + nx.write_graphml_lxml(G, graphml_file) + + gexf_file = os.path.join(output_dir, "graph_context.gexf") + logging.info(f'Writting context graph in {gexf_file}') + nx.readwrite.gexf.write_gexf(G, gexf_file) - nx.write_graphml_lxml(G, os.path.join(output_dir, "graph_context.graphml")) -def compute_edge_metrics(context_graph: nx.Graph, gene_proportion_cutoff:float ): - # compute jaccard on organism +def compute_edge_metrics(context_graph: nx.Graph, 
gene_proportion_cutoff: float) -> None: + """ + Compute various metrics on the edges of the context graph. + + :param context_graph: The context graph. + :param gene_proportion_cutoff: The minimum proportion of shared genes between two features for their edge to be considered significant. + """ + # compute jaccard on organism and on genes for f1, f2, data in context_graph.edges(data=True): data['jaccard_organism'] = len(data['organisms'])/len(f1.organisms | f2.organisms) - data['min_jaccard_organism'] = len(data['organisms'])/min(len(f1.organisms), len(f2.organisms)) - data['max_jaccard_organism'] = len(data['organisms'])/max(len(f1.organisms), len(f2.organisms)) - - - # print("f1", f1) - # print("len data[f1]", len(data[f1])) - - # print('len(f1.genes)', len(f1.genes)) - - f1_gene_proportion = len(data[f1])/len(f1.genes) - f2_gene_proportion = len(data[f2])/len(f2.genes) + f1_gene_proportion = len(data['genes'][f1])/len(f1.genes) + f2_gene_proportion = len(data['genes'][f2])/len(f2.genes) data[f'f1'] = f1.name data[f'f2'] = f2.name - data[f'f1_gene_proportion'] = f1_gene_proportion - data[f'f2_gene_proportion'] = f2_gene_proportion + data[f'f1_jaccard_gene'] = f1_gene_proportion + data[f'f2_jaccard_gene'] = f2_gene_proportion - data[f'is_gene_proportion_>_{gene_proportion_cutoff}'] = (f1_gene_proportion >= gene_proportion_cutoff) and (f2_gene_proportion >= gene_proportion_cutoff) + data[f'is_jaccard_gene_>_{gene_proportion_cutoff}'] = (f1_gene_proportion >= gene_proportion_cutoff) and (f2_gene_proportion >= gene_proportion_cutoff) - # for k, v in data.items(): - # if type(v) != set: - # print(k, v) - # print('===') - + # the following commented out lines are additional metrics that could be used + + # data['min_jaccard_organism'] = len(data['organisms'])/min(len(f1.organisms), len(f2.organisms)) + # data['max_jaccard_organism'] = len(data['organisms'])/max(len(f1.organisms), len(f2.organisms)) + # f1_gene_proportion_partial = 
len(data['genes'][f1])/len(context_graph.nodes[f1]['genes']) + # f2_gene_proportion_partial = len(data['genes'][f2])/len(context_graph.nodes[f2]['genes']) + # data[f'f1_jaccard_gene_partital'] = f1_gene_proportion_partial + # data[f'f2_jaccard_gene_partital'] = f2_gene_proportion_partial def add_edges_to_context_graph(context_graph: nx.Graph, @@ -185,7 +268,7 @@ def add_edges_to_context_graph(context_graph: nx.Graph, contig_size=len(contig_genes), is_circular=is_circular) next_genes = list(next_genes) - for next_gene_index in next_genes: + for i, next_gene_index in enumerate(next_genes): # Check if the next gene is within the contig windows if not any(lower <= next_gene_index <= upper for (lower, upper) in contig_windows): # next_gene_index is not in any range of genes in the context @@ -200,39 +283,60 @@ def add_edges_to_context_graph(context_graph: nx.Graph, context_graph.add_edge(gene.family, next_gene.family) + if i == 0: + context_graph[gene.family][next_gene.family]['adjacent_family'] = True + + + # Add node attributes + node_gene_dict = context_graph.nodes[gene.family] + next_gene_gene_dict = context_graph.nodes[next_gene.family] + + increment_attribute_counter(node_gene_dict, "genes_count") + increment_attribute_counter(next_gene_gene_dict, "genes_count") + + add_val_to_dict_attribute(node_gene_dict, "genes", gene) + add_val_to_dict_attribute(next_gene_gene_dict, "genes", next_gene) + + # Add edge attributes edge_dict = context_graph[gene.family][next_gene.family] - add_val_to_edge_attribute(edge_dict, gene.family, gene) - add_val_to_edge_attribute(edge_dict, next_gene.family, next_gene) + try: + genes_edge_dict = edge_dict['genes'] + except: + genes_edge_dict = {} + edge_dict['genes'] = genes_edge_dict + + add_val_to_dict_attribute(genes_edge_dict, gene.family, gene) + add_val_to_dict_attribute(genes_edge_dict, next_gene.family, next_gene) - add_val_to_edge_attribute(edge_dict, "organisms", gene.organism) + add_val_to_dict_attribute(edge_dict, "organisms", 
gene.organism) - update_edge_attribute_counter(edge_dict, "gene_pairs") + increment_attribute_counter(edge_dict, "gene_pairs") - assert gene.organism == next_gene.organism + assert gene.organism == next_gene.organism -def add_val_to_edge_attribute(edge_dict: dict, attribute_key, attribute_value): +def add_val_to_dict_attribute(attr_dict: dict, attribute_key, attribute_value): """ - Add an edge attribute value to the edge dictionary set. + Add an attribute value to a edge or node dictionary set. - :param edge_dict: The dictionary containing the edge attributes. + :param attr_dict: The dictionary containing the edge/node attributes. :param attribute_key: The key of the attribute. :param attribute_value: The value of the attribute to be added. """ try: - edge_dict[attribute_key].add(attribute_value) + attr_dict[attribute_key].add(attribute_value) except KeyError: - edge_dict[attribute_key] = {attribute_value} + attr_dict[attribute_key] = {attribute_value} -def update_edge_attribute_counter(edge_dict: dict, key:Hashable): +def increment_attribute_counter(edge_dict: dict, key:Hashable): """ - Update the counter for an edge attribute in the edge dictionary. + Increment the counter for an edge/node attribute in the edge/node dictionary. - :param edge_dict: The dictionary containing the edge attributes. + :param edge_dict: The dictionary containing the attributes. :param key: The key of the attribute. 
""" @@ -361,7 +465,6 @@ def compute_gene_context_graph(families: Iterable[GeneFamily], transitive: int = contig_to_genes_of_interest = get_contig_to_genes(families) for contig, genes_of_interest in tqdm(contig_to_genes_of_interest.items(), unit="contig", total=len(contig_to_genes_of_interest), disable=disable_bar): - logging.debug(f'Processing {len(genes_of_interest)} genes of interest in contig {contig}') genes_count = len(contig.genes) @@ -378,22 +481,22 @@ def compute_gene_context_graph(families: Iterable[GeneFamily], transitive: int = return context_graph -def compute_gene_context(g: nx.Graph, jaccard: float = 0.85) -> set: - """ - Compute the gene contexts in the graph +# def compute_gene_context(g: nx.Graph, jaccard: float = 0.85) -> set: +# """ +# Compute the gene contexts in the graph - :param g: Graph of gene contexts between interesting gene families of the pan - :param jaccard: Jaccard index +# :param g: Graph of gene contexts between interesting gene families of the pan +# :param jaccard: Jaccard index - :return: Set of gene contexts find in graph - """ +# :return: Set of gene contexts find in graph +# """ - gene_contexts = set() - c = 1 - for comp in connected_components(g, removed=set(), weight=jaccard): - gene_contexts.add(GeneContext(gc_id=c, families=comp)) - c += 1 - return gene_contexts +# gene_contexts = set() +# c = 1 +# for comp in connected_components(g, removed=set(), weight=jaccard): +# gene_contexts.add(GeneContext(gc_id=c, families=comp)) +# c += 1 +# return gene_contexts def fam2seq(seq_to_pan: dict) -> dict: @@ -415,34 +518,33 @@ def fam2seq(seq_to_pan: dict) -> dict: return fam_2_seq -def export_to_dataframe(families: set, gene_contexts: set, fam_to_seq: dict, output: str): +def export_context_to_dataframe(gene_contexts: set, fam_to_seq: dict, output: str): """ Export the results into dataFrame - :param families: Families related to the connected components :param gene_contexts: connected components found in the pan :param fam_to_seq: 
Dictionary with gene families as keys and list of sequence ids as values :param output: output path """ - logging.getLogger().debug(f"There are {len(families)} families among {len(gene_contexts)} gene contexts") - lines = [] for gene_context in gene_contexts: for family in gene_context.families: - line = [gene_context.ID] - if fam_to_seq is None or fam_to_seq.get(family.ID) is None: - line += [family.name, None, len(family.organisms), family.named_partition] - else: - line += [family.name, ','.join(fam_to_seq.get(family.ID)), - len(family.organisms), family.named_partition] - lines.append(line) - df = pd.DataFrame(lines, - columns=["GeneContext ID", "Gene family name", "Sequence ID", "Nb Genomes", "Partition"] - ).set_index("GeneContext ID") - df.sort_values(["GeneContext ID", "Sequence ID"], na_position='last').to_csv( - path_or_buf=f"{output}/gene_contexts.tsv", sep="\t", na_rep='NA') - logging.getLogger(f"detected gene context(s) are listed in: '{output}/gene_contexts.tsv'") + + family_info = {"GeneContext ID":gene_context.ID, + "Gene family name": family.name, + "Sequence ID":None if fam_to_seq is None else ','.join(fam_to_seq.get(family.ID)), + "Nb Genomes":len(family.organisms), + "Partition": family.named_partition } + lines.append(family_info) + + df = pd.DataFrame(lines).set_index("GeneContext ID") + + df = df.sort_values(["GeneContext ID", "Sequence ID"], na_position='last') + + df.to_csv(output, sep="\t", na_rep='NA') + + logging.getLogger().debug(f"detected gene context(s) are listed in: '{output}") def launch(args: argparse.Namespace): @@ -460,7 +562,7 @@ def launch(args: argparse.Namespace): search_gene_context_in_pangenome(pangenome=pangenome, output=args.output, tmpdir=args.tmpdir, sequences=args.sequences, families=args.family, transitive=args.transitive, identity=args.identity, coverage=args.coverage, jaccard_threshold=args.jaccard, window_size=args.window_size, - no_defrag=args.no_defrag, cpu=args.cpu, disable_bar=args.disable_prog_bar) + 
no_defrag=args.no_defrag, cpu=args.cpu, write_context_graph=args.write_graph, disable_bar=args.disable_prog_bar) def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: @@ -514,7 +616,8 @@ def parser_context(parser: argparse.ArgumentParser): optional.add_argument("-s", "--jaccard", required=False, type=restricted_float, default=0.85, help="minimum jaccard similarity used to filter edges between gene families. Increasing it " "will improve precision but lower sensitivity a lot.") - + optional.add_argument('--write_graph', action="store_true", + help="Write context graph in GEXF format.") if __name__ == '__main__': """To test local change and allow using debugger""" From ababedc6c2e50813b72067c057fc7dbf8cefa3a1 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 23 May 2023 13:48:52 +0200 Subject: [PATCH 012/173] fix bug in writting context table with seq input --- ppanggolin/context/searchGeneContext.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index 35470750..551b3ed9 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -531,9 +531,14 @@ def export_context_to_dataframe(gene_contexts: set, fam_to_seq: dict, output: st for gene_context in gene_contexts: for family in gene_context.families: + if fam_to_seq is None or fam_to_seq.get(family.ID) is None: + sequence_id = None + else: + sequence_id = ','.join(fam_to_seq.get(family.ID)) + family_info = {"GeneContext ID":gene_context.ID, "Gene family name": family.name, - "Sequence ID":None if fam_to_seq is None else ','.join(fam_to_seq.get(family.ID)), + "Sequence ID":sequence_id, "Nb Genomes":len(family.organisms), "Partition": family.named_partition } lines.append(family_info) @@ -544,7 +549,7 @@ def export_context_to_dataframe(gene_contexts: set, fam_to_seq: dict, output: st df.to_csv(output, sep="\t", na_rep='NA') - 
logging.getLogger().debug(f"detected gene context(s) are listed in: '{output}") + logging.getLogger().debug(f"detected gene context(s) are listed in: '{output}'") def launch(args: argparse.Namespace): From 55124ee76e82e93929ca5ffed2fd428fa21db873 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Thu, 8 Jun 2023 14:24:05 +0200 Subject: [PATCH 013/173] export filttered graph with edges jaccard > cutoff --- ppanggolin/context/searchGeneContext.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index 551b3ed9..cbde1a20 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -116,7 +116,7 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: families_in_contexts = {family for gene_context in gene_contexts for family in gene_context.families} - graph_with_final_contexts = nx.subgraph_view(gene_context_graph, filter_node=lambda n: n in families_in_contexts) + graph_with_final_contexts = nx.subgraph_view(filtered_graph, filter_node=lambda n: n in families_in_contexts) if write_context_graph: write_graph(graph_with_final_contexts, output, gene_families, gene_contexts) From 12e938d66f3757673a27da46abfc3d92d673620a Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Mon, 12 Jun 2023 18:17:32 +0200 Subject: [PATCH 014/173] add projection command --- ppanggolin/__init__.py | 4 +- ppanggolin/main.py | 3 + ppanggolin/projection/__init__.py | 1 + ppanggolin/projection/projection.py | 106 ++++++++++++++++++++++++++++ 4 files changed, 113 insertions(+), 1 deletion(-) create mode 100644 ppanggolin/projection/__init__.py create mode 100644 ppanggolin/projection/projection.py diff --git a/ppanggolin/__init__.py b/ppanggolin/__init__.py index b797d552..b1b3ce64 100755 --- a/ppanggolin/__init__.py +++ b/ppanggolin/__init__.py @@ -11,6 +11,7 @@ import ppanggolin.mod import ppanggolin.context import ppanggolin.workflow 
+import ppanggolin.projection # import ppanggolin.utility @@ -33,5 +34,6 @@ "rgp":ppanggolin.RGP.genomicIsland.subparser, "spot":ppanggolin.RGP.spot.subparser, "module":ppanggolin.mod.subparser, - "context":ppanggolin.context.subparser,# "info":ppanggolin.info.subparser, "default_config":ppanggolin.utility.default_config.subparser + "context":ppanggolin.context.subparser, + "projection":ppanggolin.projection.subparser, # "info":ppanggolin.info.subparser, "default_config":ppanggolin.utility.default_config.subparser } diff --git a/ppanggolin/main.py b/ppanggolin/main.py index 8d9cd4d5..440f0ec5 100644 --- a/ppanggolin/main.py +++ b/ppanggolin/main.py @@ -55,6 +55,7 @@ def cmd_line() -> argparse.Namespace: desc += " partition Partition the pangenome graph\n" desc += " rarefaction Compute the rarefaction curve of the pangenome\n" desc += " msa Compute Multiple Sequence Alignments for pangenome gene families\n" + desc += " projection Projet a new genome to an existing pangenome\n" desc += " \n" desc += " Output:\n" desc += " draw Draw figures representing the pangenome through different aspects\n" @@ -182,6 +183,8 @@ def main(): ppanggolin.metrics.metrics.launch(args) elif args.subcommand == "align": ppanggolin.align.launch(args) + elif args.subcommand == "projection": + ppanggolin.align.launch(args) elif args.subcommand == "rgp": ppanggolin.RGP.genomicIsland.launch(args) elif args.subcommand == "spot": diff --git a/ppanggolin/projection/__init__.py b/ppanggolin/projection/__init__.py new file mode 100644 index 00000000..56bb37d6 --- /dev/null +++ b/ppanggolin/projection/__init__.py @@ -0,0 +1 @@ +from .projection import subparser, launch \ No newline at end of file diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py new file mode 100644 index 00000000..f74d7150 --- /dev/null +++ b/ppanggolin/projection/projection.py @@ -0,0 +1,106 @@ +#!/usr/bin/env python3 +# coding:utf-8 + +# default libraries +import argparse +from multiprocessing 
import get_context +import logging +import os +import time + +# installed libraries +from tqdm import tqdm + +# # local libraries +# from ppanggolin.annotate.synta import annotate_organism, read_fasta, get_dna_sequence +# from ppanggolin.pangenome import Pangenome +# from ppanggolin.genome import Organism, Gene, RNA, Contig +# from ppanggolin.utils import read_compressed_or_not, mk_file_name, min_one, restricted_float +# from ppanggolin.formats import write_pangenome + + +def launch(args: argparse.Namespace): + """ + Command launcher + + :param args: All arguments provide by user + """ + pass + + +def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: + """ + Subparser to launch PPanGGOLiN in Command line + + :param sub_parser : sub_parser for align command + + :return : parser arguments for align command + """ + parser = sub_parser.add_parser("projection", formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser_projection(parser) + return parser + + +def parser_projection(parser: argparse.ArgumentParser): + """ + Parser for specific argument of annotate command + + :param parser: parser for annotate argument + """ + required = parser.add_argument_group(title="Required arguments", + description="One of the following arguments is required :") + required.add_argument('-p', '--pangenome', required=False, type=str, help="The pangenome.h5 file") + + required.add_argument('--organism_name', required=False, type=str, + help="Name of the organism whose genome is being projected onto the provided pangenome.") + + required.add_argument('--fasta_file', required=False, type=str, + help="The filepath of the genomic sequence(s) in FASTA format for the projected genome. " + "(Fasta file can be compressed with gzip)") + + required.add_argument('--anno_file', required=False, type=str, + help="The filepath of the annotations in GFF/GBFF format for the projected genome. 
" + "(Annotation file can be compressed with gzip)") + + # required.add_argument('--fasta', required=False, type=str, + # help="A tab-separated file listing the organism names, and the fasta filepath of its genomic " + # "sequence(s) (the fastas can be compressed with gzip). One line per organism.") + + # required.add_argument('--anno', required=False, type=str, + # help="A tab-separated file listing the organism names, and the gff/gbff filepath of its " + # "annotations (the files can be compressed with gzip). One line per organism. " + # "If this is provided, those annotations will be used.") + + annotate = parser.add_argument_group(title="Annotation arguments") + + annotate.add_argument('-o', '--output', required=False, type=str, + default="ppanggolin_output" + time.strftime("_DATE%Y-%m-%d_HOUR%H.%M.%S", + time.localtime()) + "_PID" + str(os.getpid()), + help="Output directory") + annotate.add_argument('--allow_overlap', required=False, action='store_true', default=False, + help="Use to not remove genes overlapping with RNA features.") + annotate.add_argument("--norna", required=False, action="store_true", default=False, + help="Use to avoid annotating RNA features.") + annotate.add_argument("--kingdom", required=False, type=str.lower, default="bacteria", + choices=["bacteria", "archaea"], + help="Kingdom to which the prokaryota belongs to, " + "to know which models to use for rRNA annotation.") + annotate.add_argument("--translation_table", required=False, type=int, default=11, + help="Translation table (genetic code) to use.") + annotate.add_argument("--basename", required=False, default="pangenome", help="basename for the output file") + + annotate.add_argument("--prodigal_procedure", required=False, type=str.lower, choices=["single", "meta"], + default=None, help="Allow to force the prodigal procedure. 
" + "If nothing given, PPanGGOLiN will decide in function of contig length") + + + cluster = parser.add_argument_group(title="Clustering arguments") + cluster.add_argument('--no_defrag', required=False, action="store_true", + help="DO NOT Realign gene families to link fragments with" + "their non-fragmented gene family.") + cluster.add_argument('--identity', required=False, type=float, default=0.5, + help="min identity percentage threshold") + cluster.add_argument('--coverage', required=False, type=float, default=0.8, + help="min coverage percentage threshold") + cluster.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus") + From 5f3f62966fcc460f4a2b99b5fc8bbc84ba3ed161 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 14 Jun 2023 11:31:01 +0200 Subject: [PATCH 015/173] load less thing when just writing family prot seqs --- ppanggolin/formats/writeSequences.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py index 4f9a78b7..e759cf88 100644 --- a/ppanggolin/formats/writeSequences.py +++ b/ppanggolin/formats/writeSequences.py @@ -347,14 +347,18 @@ def write_sequence_files(pangenome: Pangenome, output: str, fasta: str = None, a need_regions = False need_modules = False - if any(x is not None for x in [regions, genes, gene_families, prot_families]): + if prot_families is not None: + need_families = True + + if any(x is not None for x in [regions, genes, gene_families]): need_annotations = True need_families = True if regions is not None or any(x == "rgp" for x in (genes, gene_families, prot_families)): + need_annotations = True need_regions = True if any(x in ["persistent", "shell", "cloud"] for x in (genes, gene_families, prot_families)): need_partitions = True - for x in (genes, gene_families, prot_families): + for x in (genes, gene_families): if x is not None and 'module_' in x: need_modules = True From 
fac6a006c2b5e6644ec3c0ad1acdbd21a7b9a799 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 20 Jun 2023 14:51:49 +0200 Subject: [PATCH 016/173] Add transitivity in edge attributes --- ppanggolin/context/searchGeneContext.py | 26 ++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index 5901acb3..48e4811e 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -182,11 +182,13 @@ def filter_attribute(data:dict): G = nx.Graph() - G.add_edges_from(((f1.name, f2.name) for f1,f2 in context_graph.edges())) + G.add_edges_from((f1.name, f2.name, filter_attribute(d)) for f1, f2, d in context_graph.edges(data=True)) + - edges_with_attributes = {(f1.name, f2.name):filter_attribute(d) for f1,f2,d in context_graph.edges(data=True)} + # convert transitivity dict to str + edges_with_transitivity_str = {(f1.name, f2.name):str(d['transitivity']) for f1, f2, d in context_graph.edges(data=True)} - nx.set_edge_attributes(G, edges_with_attributes) + nx.set_edge_attributes(G, edges_with_transitivity_str, name="transitivity") nodes_attributes_filtered = {f.name:filter_attribute(d) for f,d in context_graph.nodes(data=True)} @@ -235,6 +237,12 @@ def compute_edge_metrics(context_graph: nx.Graph, gene_proportion_cutoff: float) data[f'f2_jaccard_gene'] = f2_gene_proportion data[f'is_jaccard_gene_>_{gene_proportion_cutoff}'] = (f1_gene_proportion >= gene_proportion_cutoff) and (f2_gene_proportion >= gene_proportion_cutoff) + + transitivity_counter = data['transitivity'] + + mean_transitivity = sum((transitivity*counter for transitivity, counter in transitivity_counter.items()))/sum((counter for counter in transitivity_counter.values())) + + data['mean_transitivity'] = mean_transitivity # the following commented out lines are additional metrics that could be used @@ -283,10 +291,17 @@ def add_edges_to_context_graph(context_graph: 
nx.Graph, context_graph.add_edge(gene.family, next_gene.family) - if i == 0: - context_graph[gene.family][next_gene.family]['adjacent_family'] = True + edge_dict = context_graph[gene.family][next_gene.family] + if i == 0: + edge_dict['adjacent_family'] = True + + # Store information of the transitivity used to link the two genes: + if "transitivity" not in edge_dict: + edge_dict['transitivity'] = {i:0 for i in range(t +1)} + edge_dict['transitivity'][i] += 1 + # Add node attributes node_gene_dict = context_graph.nodes[gene.family] next_gene_gene_dict = context_graph.nodes[next_gene.family] @@ -375,6 +390,7 @@ def get_n_next_genes_index(current_index: int, next_genes_count: int, contig_siz if i == next_genes_count: break yield next_gene_index + def extract_contig_window(contig_size: int, positions_of_interest: Iterable[int], window_size: int, is_circular:bool = False): """ From d9f8a9c5de332b2e3a297a1fba2ddd0ae3a4eb80 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 20 Jun 2023 14:55:51 +0200 Subject: [PATCH 017/173] add metadata subparser in __init__ --- ppanggolin/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ppanggolin/__init__.py b/ppanggolin/__init__.py index b797d552..57657904 100755 --- a/ppanggolin/__init__.py +++ b/ppanggolin/__init__.py @@ -11,6 +11,7 @@ import ppanggolin.mod import ppanggolin.context import ppanggolin.workflow +import ppanggolin.meta # import ppanggolin.utility @@ -33,5 +34,6 @@ "rgp":ppanggolin.RGP.genomicIsland.subparser, "spot":ppanggolin.RGP.spot.subparser, "module":ppanggolin.mod.subparser, - "context":ppanggolin.context.subparser,# "info":ppanggolin.info.subparser, "default_config":ppanggolin.utility.default_config.subparser + "context":ppanggolin.context.subparser, + "metadata":ppanggolin.meta.subparser # "info":ppanggolin.info.subparser, "default_config":ppanggolin.utility.default_config.subparser } From 05127ded5ce1ba76e6d0432c80fce33343464a67 Mon Sep 17 00:00:00 2001 From: JeanMainguy 
Date: Wed, 21 Jun 2023 11:09:00 +0200 Subject: [PATCH 018/173] add graph in gene context object and return them This useful for panorama --- ppanggolin/context/searchGeneContext.py | 133 ++++++++++++++---------- ppanggolin/region.py | 45 ++++++-- 2 files changed, 113 insertions(+), 65 deletions(-) diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index 48e4811e..515a9aeb 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -56,7 +56,7 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: check_pangenome_info(pangenome, need_annotations=True, need_families=True, disable_bar=disable_bar) - gene_families = set() + families_of_interest = set() fam_2_seq = None if sequences is not None: @@ -69,14 +69,14 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: new_tmpdir.cleanup() for pan_family in seq2pan.values(): - gene_families.add(pan_family) + families_of_interest.add(pan_family) fam_2_seq = fam2seq(seq2pan) if families is not None: with open(families, 'r') as f: for fam_name in f.read().splitlines(): - gene_families.add(pangenome.get_gene_family(fam_name)) + families_of_interest.add(pangenome.get_gene_family(fam_name)) # half_window = round((window_size-1)/2) # logging.info(f'Window size of {half_window*2 + 1}. 
Gene context will include {half_window} genes on each side of the target gene.') @@ -86,7 +86,7 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: logging.getLogger().info("Building the graph...") - gene_context_graph = compute_gene_context_graph(families=gene_families, transitive=transitive, window_size=window_size, disable_bar=disable_bar) + gene_context_graph = compute_gene_context_graph(families=families_of_interest, transitive=transitive, window_size=window_size, disable_bar=disable_bar) logging.getLogger().info( f"Took {round(time.time() - start_time, 2)} seconds to build the graph to find common gene contexts") @@ -102,27 +102,14 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: logging.getLogger().debug(f"Filtering context graph on {filter_flag}") logging.getLogger().debug(f"Context graph made of {nx.number_of_nodes(filtered_graph)} nodes and {nx.number_of_edges(filtered_graph)} edges") - connected_components = nx.connected_components(filtered_graph) - # Connected component graph Filtering - - # remove singleton famillies - connected_components = (component for component in connected_components if len(component) > 1) - - # remove component made only of famillies not initially requested - connected_components = (component for component in connected_components if component & gene_families) - - gene_contexts = {GeneContext(gc_id=i, families=component) for i, component in enumerate(connected_components) } - - families_in_contexts = {family for gene_context in gene_contexts for family in gene_context.families} - - graph_with_final_contexts = nx.subgraph_view(filtered_graph, filter_node=lambda n: n in families_in_contexts) + gene_contexts = get_gene_contexts(filtered_graph, families_of_interest) if write_context_graph: - write_graph(graph_with_final_contexts, output, gene_families, gene_contexts) + write_graph(gene_contexts, output, families_of_interest, graph_format=['graphml', "gexf"]) - if 
len(families_in_contexts) != 0: - logging.getLogger().debug(f"There are {len(families_in_contexts)} families among {len(gene_contexts)} gene contexts") + if len(gene_contexts) != 0: + logging.getLogger().debug(f"There are {sum((len(gc) for gc in gene_contexts))} families among {len(gene_contexts)} gene contexts") output_file = os.path.join(output, "gene_contexts.tsv") @@ -133,7 +120,7 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: logging.getLogger().info(f"Computing gene contexts took {round(time.time() - start_time, 2)} seconds") - + return gene_contexts # # Finding connected components with panmodule functions # # extract the modules from the graph @@ -155,20 +142,60 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: # export_context_to_dataframe(gene_contexts_pandmodule_way, fam_2_seq, output_file) +def get_gene_contexts(context_graph: nx.Graph, families_of_interest: Set[GeneFamily]) -> Set[GeneContext]: + """ + Extract gene contexts from a context graph based on the provided set of gene families of interest. + + Gene contexts are extracted from a context graph by identifying connected components. + The function filters the connected components based on the following criteria: + - Remove singleton families (components with only one gene family). + - Remove components that do not contain any gene families of interest. + + For each remaining connected component, a GeneContext object is created. + + :param context_graph: The context graph from which to extract gene contexts. + :param families_of_interest: Set of gene families of interest. + :return: Set of GeneContext objects representing the extracted gene contexts. 
+ """ + + connected_components = nx.connected_components(context_graph) + + # Connected component graph Filtering + + # remove singleton famillies + connected_components = (component for component in connected_components if len(component) > 1) + + # remove component made only of famillies not initially requested + connected_components = (component for component in connected_components if component & families_of_interest) + + gene_contexts = set() + for i, component in enumerate(connected_components): + + family_of_interest_of_gc = component & families_of_interest + gene_context = GeneContext(gc_id=i, families=component, families_of_interest=family_of_interest_of_gc) + + graph_of_gc = nx.subgraph_view(context_graph, filter_node=lambda n: n in component) # .copy() + nx.set_node_attributes(graph_of_gc, i, name="gene_context_id") + gene_context.add_context_graph(graph_of_gc) + + gene_contexts.add(gene_context) + + return gene_contexts -def write_graph(context_graph: nx.Graph, output_dir: str, famillies_of_interest: Set[GeneFamily], gene_contexts:List[GeneContext]): + +def write_graph(gene_contexts:List[GeneContext], output_dir: str, families_of_interest: Set[GeneFamily], graph_format:List[str]): """ Write a graph to file with node and edge attributes. - This function writes a graph to a file in the GraphML format or in GEXF format. The original context graph contains + This function writes a graph to a file in the GraphML format or/and in GEXF format. The original context graph contains ppanggolin objects as nodes and lists and dictionaries in edge attributes. Since these objects cannot be written to the output graph, this function creates a new graph that contains only writable objects. - :param context_graph: A NetworkX Graph object representing the graph. + :param gene_contexts: List of gene context. it includes graph of the context :param output_dir: The output directory where the graph file will be written. 
- :param famillies_of_interest: A list of node objects that are of interest. - :param gene_contexts: List of gene context, used to add context id to node of the graph + :param families_of_interest: A list of node objects that are of interest. + :param graph_format: List of formats of the output graph. Can be graphml or gexf """ def filter_attribute(data:dict): @@ -182,38 +209,36 @@ def filter_attribute(data:dict): G = nx.Graph() - G.add_edges_from((f1.name, f2.name, filter_attribute(d)) for f1, f2, d in context_graph.edges(data=True)) + for gc in gene_contexts: + G.add_edges_from((f1.name, f2.name, filter_attribute(d)) for f1, f2, d in gc.graph.edges(data=True)) - # convert transitivity dict to str - edges_with_transitivity_str = {(f1.name, f2.name):str(d['transitivity']) for f1, f2, d in context_graph.edges(data=True)} - - nx.set_edge_attributes(G, edges_with_transitivity_str, name="transitivity") + # convert transitivity dict to str + edges_with_transitivity_str = {(f1.name, f2.name):str(d['transitivity']) for f1, f2, d in gc.graph.edges(data=True)} - nodes_attributes_filtered = {f.name:filter_attribute(d) for f,d in context_graph.nodes(data=True)} - - # on top of attributes already contained in node of context graph - # add organisms and genes count that have the family, the partition and if the family was in initially requested - nodes_family_data = {f.name:{"organisms":len(f.organisms), - "partition":f.partition, - "genes":len(f.genes), - "famillies_of_interest": f in famillies_of_interest} for f in context_graph.nodes()} + nx.set_edge_attributes(G, edges_with_transitivity_str, name="transitivity") - family_name_to_context_id = {family.name:context.ID for context in gene_contexts for family in context.families} - - for f, d in G.nodes(data=True): - d.update(nodes_family_data[f]) - d.update(nodes_attributes_filtered[f]) - d['context_id'] = family_name_to_context_id[f] + nodes_attributes_filtered = {f.name:filter_attribute(d) for f,d in 
gc.graph.nodes(data=True)} + + # on top of attributes already contained in node of context graph + # add organisms and genes count that have the family, the partition and if the family was in initially requested + nodes_family_data = {f.name:{"organisms": len(f.organisms), + "partition": f.named_partition, + "genes": len(f.genes), + "families_of_interest": f in families_of_interest} for f in gc.graph.nodes()} + for f, d in G.nodes(data=True): + d.update(nodes_family_data[f]) + d.update(nodes_attributes_filtered[f]) - - graphml_file = os.path.join(output_dir, "graph_context.graphml") - logging.info(f'Writting context graph in {graphml_file}') - nx.write_graphml_lxml(G, graphml_file) + if "graphml" in graph_format: + graphml_file = os.path.join(output_dir, "graph_context.graphml") + logging.info(f'Writting context graph in {graphml_file}') + nx.write_graphml_lxml(G, graphml_file) - gexf_file = os.path.join(output_dir, "graph_context.gexf") - logging.info(f'Writting context graph in {gexf_file}') - nx.readwrite.gexf.write_gexf(G, gexf_file) + if "gexf" in graph_format: + gexf_file = os.path.join(output_dir, "graph_context.gexf") + logging.info(f'Writting context graph in {gexf_file}') + nx.readwrite.gexf.write_gexf(G, gexf_file) def compute_edge_metrics(context_graph: nx.Graph, gene_proportion_cutoff: float) -> None: @@ -610,7 +635,7 @@ def parser_context(parser: argparse.ArgumentParser): required = parser.add_argument_group(title="Required arguments", description="All of the following arguments are required :") required.add_argument('-p', '--pangenome', required=False, type=str, help="The pangenome.h5 file") - required.add_argument('-o', '--output', required=True, type=str, + required.add_argument('-o', '--output', required=False, type=str, help="Output directory where the file(s) will be written") onereq = parser.add_argument_group(title="Input file", description="One of the following argument is required :") onereq.add_argument('-S', '--sequences', required=False, 
type=str, diff --git a/ppanggolin/region.py b/ppanggolin/region.py index 7209091a..f7f40c7f 100644 --- a/ppanggolin/region.py +++ b/ppanggolin/region.py @@ -7,8 +7,8 @@ from collections.abc import Iterable # installed libraries -from typing import Dict - +from typing import Dict, Set +import networkx as nx import gmpy2 # local libraries @@ -431,26 +431,49 @@ def mk_bitarray(self, index: Dict[Organism, int], partition: str = 'all'): class GeneContext: """ - A class used to represent a gene context + A class used to represent a gene context. + + :param gc_id: Identifier of the gene context. + :param families: Gene families related to the gene context. + :param families_of_interest: Input families for which the context is being searched. - :param gc_id : identifier of the Gene context - :param families: Gene families related to the GeneContext + Gene contexts are used to represent a set of gene families and their relationships. """ - def __init__(self, gc_id: int, families: set = None): + def __init__(self, gc_id: int, families: Set[GeneFamily] = None, families_of_interest: Set[GeneFamily] = None): self.ID = gc_id self.families = set() + self.families_of_interest = families_of_interest + self.graph = None if families is not None: if not all(isinstance(fam, GeneFamily) for fam in families): - raise Exception("You provided elements that were not GeneFamily object." - " GeneContext are only made of GeneFamily") + raise Exception("You provided elements that were not GeneFamily objects. " + "GeneContexts are only made of GeneFamily objects.") self.families |= set(families) def add_family(self, family: GeneFamily): """ - Allow to add one family in the GeneContext - :param family: family to add + Add a gene family to the gene context. + + :param family: The gene family to add. """ if not isinstance(family, GeneFamily): - raise Exception("You did not provide a GenFamily object. Modules are only made of GeneFamily") + raise Exception("You did not provide a GeneFamily object. 
" + "GeneContexts are only made of GeneFamily objects.") self.families.add(family) + + def __len__(self) -> int: + """ + Get length of a context graph by returning the number of gene families it includes. + + :return: number of family in the gene context + """ + return len(self.families) + + def add_context_graph(self, graph: nx.Graph): + """ + Add a context graph to the gene context. + + :param graph: The context graph. + """ + self.graph = graph \ No newline at end of file From 28e1db2d410bf8e0a3a018355679b9075140fb95 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= <39793176+jpjarnoux@users.noreply.github.com> Date: Wed, 21 Jun 2023 13:29:06 +0200 Subject: [PATCH 019/173] Update main.yml Add -f option to test config --- .github/workflows/main.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 04657d3b..9c6a9684 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -106,5 +106,4 @@ jobs: run: | cd testingDataset ppanggolin utils --default_config panrgp -o panrgp_default_config.yaml - ppanggolin panrgp --anno organisms.gbff.list --cluster clusters.tsv -o test_config --config panrgp_default_config.yaml - + ppanggolin panrgp --anno organisms.gbff.list --cluster clusters.tsv -o test_config --config panrgp_default_config.yaml -f From 763202d3c6318f73dbd3e2d1cfb270248a3c030e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= <39793176+jpjarnoux@users.noreply.github.com> Date: Wed, 21 Jun 2023 14:10:27 +0200 Subject: [PATCH 020/173] Update main.yml --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 9c6a9684..4d843152 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -105,5 +105,5 @@ jobs: shell: bash -l {0} run: | cd testingDataset - ppanggolin utils --default_config panrgp -o 
panrgp_default_config.yaml + ppanggolin utils --default_config panrgp -o panrgp_default_config.yaml -f ppanggolin panrgp --anno organisms.gbff.list --cluster clusters.tsv -o test_config --config panrgp_default_config.yaml -f From 690317e2afa187ca5c2b8c64bd7da4d752b6f27b Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Thu, 22 Jun 2023 16:34:44 +0200 Subject: [PATCH 021/173] change graph writing to make panorama interfacing easier --- ppanggolin/context/searchGeneContext.py | 118 +++++++++++++++--------- 1 file changed, 73 insertions(+), 45 deletions(-) diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index 515a9aeb..95ef6675 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -49,13 +49,6 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: :param disable_bar: Allow preventing bar progress print """ - # check statuses and load info - if sequences is not None and pangenome.status["geneFamilySequences"] not in ["inFile", "Loaded", "Computed"]: - raise Exception("Cannot use this function as your pangenome does not have gene families representatives " - "associated to it. 
For now this works only if the clustering has been made by PPanGGOLiN.") - - check_pangenome_info(pangenome, need_annotations=True, need_families=True, disable_bar=disable_bar) - families_of_interest = set() fam_2_seq = None @@ -98,15 +91,20 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: # Filter graph filter_flag = f'is_jaccard_gene_>_{jaccard_threshold}' - filtered_graph = nx.subgraph_view(gene_context_graph, filter_edge=lambda n1, n2: gene_context_graph[n1][n2][filter_flag] ) + edges_to_remove = [(n,v) for n,v,d in gene_context_graph.edges(data=True) if not d[filter_flag]] + gene_context_graph.remove_edges_from(edges_to_remove) + + # filtered_context_graph = nx.subgraph_view(gene_context_graph, filter_edge=lambda n1, n2: gene_context_graph[n1][n2][filter_flag] ) logging.getLogger().debug(f"Filtering context graph on {filter_flag}") - logging.getLogger().debug(f"Context graph made of {nx.number_of_nodes(filtered_graph)} nodes and {nx.number_of_edges(filtered_graph)} edges") + logging.getLogger().debug(f"Context graph made of {nx.number_of_nodes(gene_context_graph)} nodes and {nx.number_of_edges(gene_context_graph)} edges") - gene_contexts = get_gene_contexts(filtered_graph, families_of_interest) + gene_contexts = get_gene_contexts(gene_context_graph, families_of_interest) - if write_context_graph: - write_graph(gene_contexts, output, families_of_interest, graph_format=['graphml', "gexf"]) + + gene_context_graph = make_graph_writable(gene_context_graph) + print(f"WRITING gene context graph in {output}") + write_graph(gene_context_graph, output, graph_format=['graphml', "gexf"]) if len(gene_contexts) != 0: logging.getLogger().debug(f"There are {sum((len(gc) for gc in gene_contexts))} families among {len(gene_contexts)} gene contexts") @@ -120,7 +118,7 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: logging.getLogger().info(f"Computing gene contexts took {round(time.time() - start_time, 2)} 
seconds") - return gene_contexts + return gene_context_graph # # Finding connected components with panmodule functions # # extract the modules from the graph @@ -169,35 +167,46 @@ def get_gene_contexts(context_graph: nx.Graph, families_of_interest: Set[GeneFam connected_components = (component for component in connected_components if component & families_of_interest) gene_contexts = set() + families_in_context = set() + for i, component in enumerate(connected_components): - + families_in_context |= component family_of_interest_of_gc = component & families_of_interest gene_context = GeneContext(gc_id=i, families=component, families_of_interest=family_of_interest_of_gc) - - graph_of_gc = nx.subgraph_view(context_graph, filter_node=lambda n: n in component) # .copy() - nx.set_node_attributes(graph_of_gc, i, name="gene_context_id") - gene_context.add_context_graph(graph_of_gc) + + # nx.set_node_attributes(context_graph, i, name="gene_context_id") + # add gc id to node attribute + node_attributes = {n:{"gene_context_id":i, "families_of_interest": n in families_of_interest} for n in component} + nx.set_node_attributes(context_graph, node_attributes) + + # for n, attribute in context_graph.nodes(data=True): + # if n in component: + # attribute['gene_context_id'] = i + # attribute['families_of_interest'] = n in families_of_interest + # print(context_graph.get_node_attributes(n, 'families_of_interest') ) + # # gene_context.add_context_graph(graph_of_gc) gene_contexts.add(gene_context) + + node_not_in_context = set(context_graph.nodes()) - families_in_context + context_graph.remove_nodes_from(node_not_in_context) return gene_contexts -def write_graph(gene_contexts:List[GeneContext], output_dir: str, families_of_interest: Set[GeneFamily], graph_format:List[str]): +def make_graph_writable(context_graph): + """ - Write a graph to file with node and edge attributes. - - This function writes a graph to a file in the GraphML format or/and in GEXF format. 
The original context graph contains + + The original context graph contains ppanggolin objects as nodes and lists and dictionaries in edge attributes. Since these objects cannot be written to the output graph, this function creates a new graph that contains only writable objects. :param gene_contexts: List of gene context. it includes graph of the context - :param output_dir: The output directory where the graph file will be written. - :param families_of_interest: A list of node objects that are of interest. - :param graph_format: List of formats of the output graph. Can be graphml or gexf """ + def filter_attribute(data:dict): """ Helper function to filter the edge attributes. @@ -206,30 +215,40 @@ def filter_attribute(data:dict): :return: A filtered dictionary containing only non-collection attributes. """ return {k:v for k, v in data.items() if type(v) not in [set, dict, list]} - + G = nx.Graph() - for gc in gene_contexts: - G.add_edges_from((f1.name, f2.name, filter_attribute(d)) for f1, f2, d in gc.graph.edges(data=True)) - + G.add_edges_from((f1.name, f2.name, filter_attribute(d)) for f1, f2, d in context_graph.edges(data=True)) - # convert transitivity dict to str - edges_with_transitivity_str = {(f1.name, f2.name):str(d['transitivity']) for f1, f2, d in gc.graph.edges(data=True)} - - nx.set_edge_attributes(G, edges_with_transitivity_str, name="transitivity") + + # convert transitivity dict to str + edges_with_transitivity_str = {(f1.name, f2.name):str(d['transitivity']) for f1, f2, d in context_graph.edges(data=True)} + + nx.set_edge_attributes(G, edges_with_transitivity_str, name="transitivity") + + nodes_attributes_filtered = {f.name:filter_attribute(d) for f,d in context_graph.nodes(data=True)} + + # on top of attributes already contained in node of context graph + # add organisms and genes count that have the family, the partition and if the family was in initially requested + nodes_family_data = {f.name:{"organisms": len(f.organisms), + "partition": 
f.named_partition, + "genes": len(f.genes)} for f in context_graph.nodes()} - nodes_attributes_filtered = {f.name:filter_attribute(d) for f,d in gc.graph.nodes(data=True)} - - # on top of attributes already contained in node of context graph - # add organisms and genes count that have the family, the partition and if the family was in initially requested - nodes_family_data = {f.name:{"organisms": len(f.organisms), - "partition": f.named_partition, - "genes": len(f.genes), - "families_of_interest": f in families_of_interest} for f in gc.graph.nodes()} - for f, d in G.nodes(data=True): - d.update(nodes_family_data[f]) - d.update(nodes_attributes_filtered[f]) - + for f, d in G.nodes(data=True): + d.update(nodes_family_data[f]) + d.update(nodes_attributes_filtered[f]) + + return G + +def write_graph(G:nx.Graph, output_dir: str, graph_format:List[str]): + """ + Write a graph to file in the GraphML format or/and in GEXF format. + + :param output_dir: The output directory where the graph file will be written. + :param graph_format: List of formats of the output graph. Can be graphml or gexf + + """ + if "graphml" in graph_format: graphml_file = os.path.join(output_dir, "graph_context.graphml") logging.info(f'Writting context graph in {graphml_file}') @@ -605,6 +624,15 @@ def launch(args: argparse.Namespace): mk_outdir(args.output, args.force) pangenome = Pangenome() pangenome.add_file(args.pangenome) + + # check statuses and load info + if args.sequences is not None and pangenome.status["geneFamilySequences"] not in ["inFile", "Loaded", "Computed"]: + raise Exception("Cannot use this function as your pangenome does not have gene families representatives " + "associated to it. 
For now this works only if the clustering has been made by PPanGGOLiN.") + + check_pangenome_info(pangenome, need_annotations=True, need_families=True, disable_bar=args.disable_prog_bar) + + search_gene_context_in_pangenome(pangenome=pangenome, output=args.output, tmpdir=args.tmpdir, sequences=args.sequences, families=args.family, transitive=args.transitive, identity=args.identity, coverage=args.coverage, jaccard_threshold=args.jaccard, window_size=args.window_size, From c8faaab2c8ee9269948764e919a70ceb1f9cc975 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Fri, 23 Jun 2023 11:45:51 +0200 Subject: [PATCH 022/173] make transitivity starts at 0. 0 mean no gap between adj genes. --- ppanggolin/context/searchGeneContext.py | 34 ++++++------------------- tests/context/test_context.py | 10 ++++---- 2 files changed, 13 insertions(+), 31 deletions(-) diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index 95ef6675..78b16083 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -29,7 +29,7 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: str, sequences: str = None, families: str = None, transitive: int = 4, identity: float = 0.5, coverage: float = 0.8, jaccard_threshold: float = 0.85, window_size: int = 1, no_defrag: bool = False, - cpu: int = 1, write_context_graph:bool = False, disable_bar=True): + cpu: int = 1, disable_bar=True): """ Main function to search common gene contexts between sequence set and pangenome families @@ -103,7 +103,6 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: gene_context_graph = make_graph_writable(gene_context_graph) - print(f"WRITING gene context graph in {output}") write_graph(gene_context_graph, output, graph_format=['graphml', "gexf"]) if len(gene_contexts) != 0: @@ -301,7 +300,7 @@ def compute_edge_metrics(context_graph: nx.Graph, gene_proportion_cutoff: float) def 
add_edges_to_context_graph(context_graph: nx.Graph, contig_genes: Iterable[Gene], contig_windows: List[Tuple[int, int]], - t: int, + transitivity: int, is_circular: bool): """ Add edges to the context graph based on contig genes and windows. @@ -309,14 +308,14 @@ def add_edges_to_context_graph(context_graph: nx.Graph, :param context_graph: The context graph to which edges will be added. :param contig_genes: An iterable of genes in the contig. :param contig_windows: A list of tuples representing the start and end positions of contig windows. - :param t: The number of next genes to consider when adding edges. + :param transitivity: The number of next genes to consider when adding edges. :param is_circular: A boolean indicating if the contig is circular. """ for window_start, window_end in contig_windows: for gene_index in range(window_start, window_end + 1): gene = contig_genes[gene_index] - next_genes = get_n_next_genes_index(gene_index, next_genes_count=t, + next_genes = get_n_next_genes_index(gene_index, next_genes_count=transitivity+1, contig_size=len(contig_genes), is_circular=is_circular) next_genes = list(next_genes) @@ -342,7 +341,7 @@ def add_edges_to_context_graph(context_graph: nx.Graph, # Store information of the transitivity used to link the two genes: if "transitivity" not in edge_dict: - edge_dict['transitivity'] = {i:0 for i in range(t +1)} + edge_dict['transitivity'] = {i:0 for i in range(transitivity +1)} edge_dict['transitivity'][i] += 1 @@ -541,23 +540,6 @@ def compute_gene_context_graph(families: Iterable[GeneFamily], transitive: int = return context_graph -# def compute_gene_context(g: nx.Graph, jaccard: float = 0.85) -> set: -# """ -# Compute the gene contexts in the graph - -# :param g: Graph of gene contexts between interesting gene families of the pan -# :param jaccard: Jaccard index - -# :return: Set of gene contexts find in graph -# """ - -# gene_contexts = set() -# c = 1 -# for comp in connected_components(g, removed=set(), 
weight=jaccard): -# gene_contexts.add(GeneContext(gc_id=c, families=comp)) -# c += 1 -# return gene_contexts - def fam2seq(seq_to_pan: dict) -> dict: """ @@ -636,7 +618,7 @@ def launch(args: argparse.Namespace): search_gene_context_in_pangenome(pangenome=pangenome, output=args.output, tmpdir=args.tmpdir, sequences=args.sequences, families=args.family, transitive=args.transitive, identity=args.identity, coverage=args.coverage, jaccard_threshold=args.jaccard, window_size=args.window_size, - no_defrag=args.no_defrag, cpu=args.cpu, write_context_graph=args.write_graph, disable_bar=args.disable_prog_bar) + no_defrag=args.no_defrag, cpu=args.cpu, disable_bar=args.disable_prog_bar) def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: @@ -690,8 +672,8 @@ def parser_context(parser: argparse.ArgumentParser): optional.add_argument("-s", "--jaccard", required=False, type=restricted_float, default=0.85, help="minimum jaccard similarity used to filter edges between gene families. 
Increasing it " "will improve precision but lower sensitivity a lot.") - optional.add_argument('--write_graph', action="store_true", - help="Write context graph in GEXF format.") + # optional.add_argument('--write_graph', action="store_true", + # help="Write context graph in GEXF format.") optional.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus") if __name__ == '__main__': diff --git a/tests/context/test_context.py b/tests/context/test_context.py index 1fd45d62..014ef334 100644 --- a/tests/context/test_context.py +++ b/tests/context/test_context.py @@ -106,7 +106,7 @@ def test_add_edges_to_context_graph(simple_contig): add_edges_to_context_graph(context_graph, contig_genes = simple_contig.genes, contig_windows = [(0,3)], - t=2, + transitivity=1, is_circular=simple_contig.is_circular) nodes = sorted([n.name for n in context_graph.nodes()]) @@ -127,7 +127,7 @@ def test_add_edges_to_context_graph_2(simple_contig): add_edges_to_context_graph(context_graph, contig_genes = simple_contig.genes, contig_windows = [(1,3)], - t=1, + transitivity=0, is_circular=simple_contig.is_circular) nodes = sorted([n.name for n in context_graph.nodes()]) @@ -149,7 +149,7 @@ def test_add_edges_to_context_graph_linear(simple_contig): add_edges_to_context_graph(context_graph, contig_genes = simple_contig.genes, contig_windows = [(4,5), (0,2)], - t=1, + transitivity=0, is_circular=False) nodes = sorted([n.name for n in context_graph.nodes()]) @@ -173,7 +173,7 @@ def test_add_edges_to_context_graph_circular(simple_contig): add_edges_to_context_graph(context_graph, contig_genes = simple_contig.genes, contig_windows = [(4,5), (0,2)], - t=1, + transitivity=0, is_circular=True) nodes = sorted([n.name for n in context_graph.nodes()]) @@ -200,7 +200,7 @@ def test_compute_gene_context_graph(simple_contig): families_of_interest = {f for f in families_in_contigs if f.name in family_names_of_interest } context_graph = 
compute_gene_context_graph(families_of_interest, - transitive=1, + transitive=0, window_size = 2) nodes = sorted([n.name for n in context_graph.nodes()]) edges = {tuple(sorted([n.name, v.name])) for n, v in context_graph.edges()} From 50aa4d5e0c42e8b4ae0e932594db0a56bb657904 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Mon, 26 Jun 2023 17:38:36 +0200 Subject: [PATCH 023/173] change output of the function to fit panorama needs --- ppanggolin/context/searchGeneContext.py | 36 +++++++++++++------------ 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index 78b16083..32584328 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -29,7 +29,7 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: str, sequences: str = None, families: str = None, transitive: int = 4, identity: float = 0.5, coverage: float = 0.8, jaccard_threshold: float = 0.85, window_size: int = 1, no_defrag: bool = False, - cpu: int = 1, disable_bar=True): + cpu: int = 1, graph_format:str = "graphml", disable_bar=True): """ Main function to search common gene contexts between sequence set and pangenome families @@ -45,7 +45,7 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: :param window_size: Number of genes to consider in the gene context. :param no_defrag: do not use the defrag workflow if true :param cpu: Number of core used to process - :param write_context_graph: Write graph of the contexts + :param graph_format: Write format of the context graph. 
Can be graphml or gexf :param disable_bar: Allow preventing bar progress print """ @@ -103,7 +103,7 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: gene_context_graph = make_graph_writable(gene_context_graph) - write_graph(gene_context_graph, output, graph_format=['graphml', "gexf"]) + out_graph_file = write_graph(gene_context_graph, output, graph_format) if len(gene_contexts) != 0: logging.getLogger().debug(f"There are {sum((len(gc) for gc in gene_contexts))} families among {len(gene_contexts)} gene contexts") @@ -117,7 +117,7 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: logging.getLogger().info(f"Computing gene contexts took {round(time.time() - start_time, 2)} seconds") - return gene_context_graph + return gene_context_graph, out_graph_file # # Finding connected components with panmodule functions # # extract the modules from the graph @@ -239,25 +239,28 @@ def filter_attribute(data:dict): return G -def write_graph(G:nx.Graph, output_dir: str, graph_format:List[str]): +def write_graph(G:nx.Graph, output_dir: str, graph_format:str): """ Write a graph to file in the GraphML format or/and in GEXF format. :param output_dir: The output directory where the graph file will be written. - :param graph_format: List of formats of the output graph. Can be graphml or gexf + :param graph_format: Formats of the output graph. 
Can be graphml or gexf """ - if "graphml" in graph_format: - graphml_file = os.path.join(output_dir, "graph_context.graphml") - logging.info(f'Writting context graph in {graphml_file}') - nx.write_graphml_lxml(G, graphml_file) + if "graphml" == graph_format: + out_file = os.path.join(output_dir, "graph_context.graphml") + logging.info(f'Writting context graph in {out_file}') + nx.write_graphml_lxml(G, out_file) - if "gexf" in graph_format: - gexf_file = os.path.join(output_dir, "graph_context.gexf") - logging.info(f'Writting context graph in {gexf_file}') - nx.readwrite.gexf.write_gexf(G, gexf_file) + elif "gexf" == graph_format: + out_file = os.path.join(output_dir, "graph_context.gexf") + logging.info(f'Writting context graph in {out_file}') + nx.readwrite.gexf.write_gexf(G, out_file) + else: + raise ValueError(f'The given graph format ({graph_format}) is not correct. it should be "graphml" or gexf') + return out_file def compute_edge_metrics(context_graph: nx.Graph, gene_proportion_cutoff: float) -> None: """ @@ -618,7 +621,7 @@ def launch(args: argparse.Namespace): search_gene_context_in_pangenome(pangenome=pangenome, output=args.output, tmpdir=args.tmpdir, sequences=args.sequences, families=args.family, transitive=args.transitive, identity=args.identity, coverage=args.coverage, jaccard_threshold=args.jaccard, window_size=args.window_size, - no_defrag=args.no_defrag, cpu=args.cpu, disable_bar=args.disable_prog_bar) + no_defrag=args.no_defrag, cpu=args.cpu, disable_bar=args.disable_prog_bar, graph_format=args.graph_format) def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: @@ -672,8 +675,7 @@ def parser_context(parser: argparse.ArgumentParser): optional.add_argument("-s", "--jaccard", required=False, type=restricted_float, default=0.85, help="minimum jaccard similarity used to filter edges between gene families. 
Increasing it " "will improve precision but lower sensitivity a lot.") - # optional.add_argument('--write_graph', action="store_true", - # help="Write context graph in GEXF format.") + optional.add_argument('--graph_format', help="Format of the context graph. Can be gexf or graphml.", default='graphml', choices=['gexf','graphml']) optional.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus") if __name__ == '__main__': From f342d84183c0ad53a00ba81e9ca38786017d97fa Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Mon, 3 Jul 2023 17:14:58 +0200 Subject: [PATCH 024/173] add projection command up to project partition --- ppanggolin/align/alignOnPang.py | 103 +++++++++++++-------- ppanggolin/formats/writeSequences.py | 5 +- ppanggolin/main.py | 8 +- ppanggolin/projection/projection.py | 133 +++++++++++++++++++++++---- ppanggolin/utils.py | 2 +- 5 files changed, 187 insertions(+), 64 deletions(-) diff --git a/ppanggolin/align/alignOnPang.py b/ppanggolin/align/alignOnPang.py index 7b43a11b..96941503 100644 --- a/ppanggolin/align/alignOnPang.py +++ b/ppanggolin/align/alignOnPang.py @@ -37,7 +37,7 @@ def createdb(file_obj: TextIOWrapper, tmpdir: tempfile.TemporaryDirectory) -> Te def align_seq_to_pang(pang_file: TextIOWrapper, seq_file: TextIOWrapper, output: str, tmpdir: tempfile.TemporaryDirectory, cpu: int = 1, no_defrag: bool = False, - identity: float = 0.8, coverage: float = 0.8) -> str: + identity: float = 0.8, coverage: float = 0.8, is_protein:bool = False, translation_table: int = None ) -> str: """ Align pangenome sequences against fasta sequence @@ -49,53 +49,75 @@ def align_seq_to_pang(pang_file: TextIOWrapper, seq_file: TextIOWrapper, output: :param no_defrag: Allow to pass the defragmentation step :param identity: minimal identity threshold for the alignment :param coverage: minimal identity threshold for the alignment + :param is_protein: Is the sequence file are protein sequences. 
If True, the sequences are not translated by mmseqs + :param translation_table: Translation table to use, if sequences are nucleotide and need to be translated. :return: Alignement result file """ pang_db = createdb(pang_file, tmpdir) seq_db = createdb(seq_file, tmpdir) - cov_mode = "0" # coverage of query and target - if not no_defrag: - cov_mode = "1" # coverage of target - aln_db = tempfile.NamedTemporaryFile(mode="w", dir=tmpdir.name) - cmd = ["mmseqs", "search", seq_db.name, pang_db.name, aln_db.name, tmpdir.name, "-a", "--min-seq-id", str(identity), - "-c", str(coverage), "--cov-mode", cov_mode, "--threads", str(cpu)] - logging.getLogger().debug(" ".join(cmd)) + + cov_mode = "1" # coverage of target + if no_defrag: + cov_mode = "0" # coverage of query and target + + with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir.name, prefix="aln_result_db_file") as aln_db: + cmd = ["mmseqs", "search", seq_db.name, pang_db.name, aln_db.name, tmpdir.name, "-a", "--min-seq-id", str(identity), + "-c", str(coverage), "--cov-mode", cov_mode, "--threads", str(cpu)] + if is_protein: + logging.getLogger().debug(f"Input sequences will be translated by mmseqs with translation table {translation_table}") + cmd += ["--translation-table", f"{translation_table}", "--translate", "0" ] + + logging.getLogger().info("Aligning sequences to cluster representatives...") - subprocess.run(cmd, stdout=subprocess.DEVNULL) - outfile = output + "/input_to_pangenome_associations.blast-tab_tmp" # write a tmp file of the results - cmd = ["mmseqs", "convertalis", seq_db.name, pang_db.name, aln_db.name, outfile, "--format-mode", "2"] logging.getLogger().debug(" ".join(cmd)) - logging.getLogger().info("Extracting alignments...") - subprocess.run(cmd, stdout=subprocess.DEVNULL) + subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) + + with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir.name, prefix="aln_result_db_file", suffix = ".tsv", delete=False) as outfile: + cmd = ["mmseqs", 
"convertalis", seq_db.name, pang_db.name, aln_db.name, outfile.name, "--format-mode", "2"] + + logging.getLogger().info("Extracting alignments...") + logging.getLogger().debug(" ".join(cmd)) + subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) + + pang_db.close() seq_db.close() aln_db.close() - return outfile + return outfile.name -def read_alignments(aln_res: str, pangenome: Pangenome) -> Tuple[Dict[str, GeneFamily], str]: +def associate_input_seq_to_gene_family_from_aln(aln_res: str, outdir:TextIOWrapper, pangenome: Pangenome) -> Tuple[Dict[str, GeneFamily], str]: """ - Read alignment result to link input sequence to pangenome + Read alignment result to link input sequence to pangenome gene family :param aln_res: Alignement result file + :param outdir: Output directory :param pangenome: Input pangenome - :return: Dictionnary with sequence link to pangenome gene families and actual name of resulting alignment file + :return: Dictionnary with sequence link to pangenome gene families and actual path to the cleaned alignment file """ seq2pang = {} - outname = open(aln_res.replace("_tmp", ""), "w") # write the actual result file - with open(aln_res, "r") as alnFile: + result_file = outdir + f"/alignment_input_seqs_to_pangenome_gene_families.tsv" # write the actual result file + logging.getLogger(f'Get write alignment file in {result_file}') + + with open(aln_res, "r") as alnFile, open(result_file, "w") as outfile : for line in alnFile: - line = line.replace("ppanggolin_", "") # remove the 'ppanggolin_' bit of the id - outname.write(line) - line = line.split() - if seq2pang.get(line[0]) is None: # if no results were found yet - seq2pang[line[0]] = pangenome.get_gene_family(line[1]) # then the best hit is the first one we see. 
- outname.close() - return seq2pang, outname.name + line_splitted = line.split() + + + line_splitted[1] = line_splitted[1].replace("ppanggolin_", "") # remove the 'ppanggolin_' bit of the id + + outfile.write("\t".join(line_splitted)) + + input_seq_id, gene_family_id = line_splitted[0:2] + + if seq2pang.get(input_seq_id) is None: # if no results were found yet + seq2pang[input_seq_id] = pangenome.get_gene_family(gene_family_id) # then the best hit is the first one we see. + + return seq2pang, outfile def get_seq(seq_file: TextIOWrapper) -> Set[str]: @@ -115,7 +137,7 @@ def get_seq(seq_file: TextIOWrapper) -> Set[str]: def write_gene_fam_sequences(pangenome: Pangenome, file_obj: TextIOWrapper, add: str = ""): """ - Export the sequence of genes in families + Export the sequence of gene families :param pangenome: Pangenome containing families :param file_obj: Temporary file where sequences will be written @@ -124,7 +146,7 @@ def write_gene_fam_sequences(pangenome: Pangenome, file_obj: TextIOWrapper, add: for fam in pangenome.gene_families: file_obj.write(">" + add + fam.name + "\n") file_obj.write(fam.sequence + "\n") - file_obj.flush() + # file_obj.flush() def project_partition(seq_to_pang: Dict[str, GeneFamily], seq_set: Set[str], output: str) -> str: @@ -140,8 +162,8 @@ def project_partition(seq_to_pang: Dict[str, GeneFamily], seq_set: Set[str], out partition_proj = output + "/sequences_partition_projection.tsv" with open(partition_proj, "w") as partProjFile: - for key, pangFam in seq_to_pang.items(): - partProjFile.write(key + "\t" + pangFam.named_partition + "\n") + for input_seq, pangFam in seq_to_pang.items(): + partProjFile.write(input_seq + "\t" + pangFam.named_partition + "\n") for remainingSeq in (seq_to_pang.keys() & seq_set): partProjFile.write(remainingSeq + "\tcloud\n") # if there is no hit, it's going to be cloud genes. 
return partition_proj @@ -270,7 +292,7 @@ def get_seq_info(seq_to_pang: dict, pangenome: Pangenome, output: str, draw_rela def get_seq2pang(pangenome: Pangenome, sequence_file: str, output: str, tmpdir: tempfile.TemporaryDirectory, cpu: int = 1, no_defrag: bool = False, identity: float = 0.8, - coverage: float = 0.8) -> Tuple[set, str, dict]: + coverage: float = 0.8, is_protein:bool = True, translation_table:int = 11) -> Tuple[set, str, dict]: """ Assign a pangenome gene family to the input sequences. @@ -282,20 +304,21 @@ def get_seq2pang(pangenome: Pangenome, sequence_file: str, output: str, tmpdir: :param no_defrag: do not use the defrag workflow if true :param identity: minimal identity threshold for the alignment :param coverage: minimal identity threshold for the alignment + :param is_protein: Is the sequence file are protein sequences. If True, the sequences are not translated by mmseqs + :param translation_table: Translation table to use, if sequences are nucleotide and need to be translated. 
:return: sequence set, blast-tab result file string, and sequences aligned with families - """ - tmp_pang_file = tempfile.NamedTemporaryFile(mode="w", dir=tmpdir.name) - - write_gene_fam_sequences(pangenome, tmp_pang_file, add="ppanggolin_") + """ + + with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir.name, delete=False, suffix=".faa") as tmp_pang_file: - with read_compressed_or_not(sequence_file) as seqFileObj: - seq_set = get_seq(seqFileObj) - align_file = align_seq_to_pang(tmp_pang_file, seqFileObj, output, tmpdir, cpu, no_defrag, identity, coverage) + write_gene_fam_sequences(pangenome, tmp_pang_file, add="ppanggolin_") - seq2pang, align_file = read_alignments(align_file, pangenome) + with read_compressed_or_not(sequence_file) as seqFileObj: + seq_set = get_seq(seqFileObj) + align_file = align_seq_to_pang(tmp_pang_file, seqFileObj, output, tmpdir, cpu, no_defrag, identity, coverage, is_protein, translation_table ) - tmp_pang_file.close() + seq2pang, align_file = associate_input_seq_to_gene_family_from_aln(align_file, output, pangenome) return seq_set, align_file, seq2pang diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py index 100d4b96..9d601d11 100644 --- a/ppanggolin/formats/writeSequences.py +++ b/ppanggolin/formats/writeSequences.py @@ -21,6 +21,7 @@ def write_gene_sequences_from_annotations(pangenome: Pangenome, file_obj: TextIO, list_cds: list = None, add: str = '', + seq_attr_to_write: str = "dna" , disable_bar: bool = False): """ Writes the CDS sequences given through list_CDS of the Pangenome object to a tmpFile object, @@ -33,6 +34,8 @@ def write_gene_sequences_from_annotations(pangenome: Pangenome, file_obj: TextIO :param add: Add prefix to gene ID :param disable_bar: Disable progress bar """ + assert seq_attr_to_write in ['dna', "protein"] + counter = 0 if list_cds is None: list_cds = pangenome.genes @@ -41,7 +44,7 @@ def write_gene_sequences_from_annotations(pangenome: Pangenome, file_obj: TextIO if 
gene.type == "CDS": counter += 1 file_obj.write('>' + add + gene.ID + "\n") - file_obj.write(gene.dna + "\n") + file_obj.write(getattr(gene, seq_attr_to_write) + "\n") file_obj.flush() diff --git a/ppanggolin/main.py b/ppanggolin/main.py index 440f0ec5..aa7364ee 100644 --- a/ppanggolin/main.py +++ b/ppanggolin/main.py @@ -55,7 +55,7 @@ def cmd_line() -> argparse.Namespace: desc += " partition Partition the pangenome graph\n" desc += " rarefaction Compute the rarefaction curve of the pangenome\n" desc += " msa Compute Multiple Sequence Alignments for pangenome gene families\n" - desc += " projection Projet a new genome to an existing pangenome\n" + desc += " projection Annotate an input genome with an existing pangenome\n" desc += " \n" desc += " Output:\n" desc += " draw Draw figures representing the pangenome through different aspects\n" @@ -130,8 +130,8 @@ def cmd_line() -> argparse.Namespace: cmds_pangenome_required = ["cluster", "info", "module", "graph","align", "context", "write", "msa", "draw", "partition", - "rarefaction", "spot", "fasta", "metrics", "rgp"] - if args.subcommand in cmds_pangenome_required and args.pangenome is None: + "rarefaction", "spot", "fasta", "metrics", "rgp", "projection"] + if args.subcommand in cmds_pangenome_required and args.pangenome is None: parser.error("You must provide a pangenome file with the --pangenome " "argument through the command line or the config file.") @@ -184,7 +184,7 @@ def main(): elif args.subcommand == "align": ppanggolin.align.launch(args) elif args.subcommand == "projection": - ppanggolin.align.launch(args) + ppanggolin.projection.projection.launch(args) elif args.subcommand == "rgp": ppanggolin.RGP.genomicIsland.launch(args) elif args.subcommand == "spot": diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index f74d7150..89b13362 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -7,27 +7,120 @@ import logging import os import 
time +from pathlib import Path +import tempfile + # installed libraries from tqdm import tqdm # # local libraries -# from ppanggolin.annotate.synta import annotate_organism, read_fasta, get_dna_sequence -# from ppanggolin.pangenome import Pangenome -# from ppanggolin.genome import Organism, Gene, RNA, Contig -# from ppanggolin.utils import read_compressed_or_not, mk_file_name, min_one, restricted_float +from ppanggolin.annotate.synta import annotate_organism, read_fasta, get_dna_sequence +from ppanggolin.annotate.annotate import read_anno_file +from ppanggolin.pangenome import Pangenome +# from ppanggolin.genome import input_organism, Gene, RNA, Contig +from ppanggolin.utils import read_compressed_or_not, mk_file_name, min_one, restricted_float, mk_outdir +from ppanggolin.align.alignOnPang import get_seq2pang, project_partition +from ppanggolin.formats.writeSequences import write_gene_sequences_from_annotations +from ppanggolin.formats import check_pangenome_info # from ppanggolin.formats import write_pangenome +def annotate_input_genome_with_pangenome(pangenome, input_organism, output, basename, cpu, no_defrag, identity, coverage, tmpdir, + disable_bar, translation_table ): + + """ + + """ + + seq_fasta_file = output / f"{input_organism.name}.fasta" + + with open(seq_fasta_file, "w") as fh_out_faa: + write_gene_sequences_from_annotations(input_organism, fh_out_faa, seq_attr_to_write="dna", + disable_bar=disable_bar) + + # get corresponding gene families + new_tmpdir = tempfile.TemporaryDirectory(dir=tmpdir, prefix="seq_to_pang_tmpdir_") + seq_set, _, seq2pan = get_seq2pang(pangenome, str(seq_fasta_file), str(output), new_tmpdir, cpu, no_defrag, identity=identity, + coverage=coverage, is_protein=False, translation_table=translation_table) + + project_partition(seq2pan, seq_set, str(output)) + +def retrieve_gene_sequences_from_fasta_file(input_organism, fasta_file): + """ + Get gene sequences from fastas + + :param pangenome: input pangenome + :param fasta_file: 
list of fasta file + """ + + with read_compressed_or_not(fasta_file) as currFastaFile: + contig_id2deq, _ = read_fasta(input_organism, currFastaFile) + + + for contig in input_organism.contigs: + try: + for gene in contig.genes: + gene.add_dna(get_dna_sequence(contig_id2deq[contig.name], gene)) + + for rna in contig.RNAs: + rna.add_dna(get_dna_sequence(contig_id2deq[contig.name], rna)) + except KeyError: + msg = f"Fasta file for input_organism {input_organism.name} did not have the contig {contig.name} " \ + f"that was read from the annotation file. " + msg += f"The provided contigs in the fasta were : " \ + f"{', '.join([contig for contig in contig_id2deq.keys()])}." + raise KeyError(msg) + + def launch(args: argparse.Namespace): """ Command launcher :param args: All arguments provide by user """ - pass - + + output_dir = Path(args.output) + mk_outdir(output_dir, args.force) + + # TODO check that the provided input_organism name is not found in pangenome + # if so add a warning or error + + if args.anno_file is not None: + # read_annotations(pangenome, args.anno, cpu=args.cpu, pseudo=args.use_pseudo, disable_bar=args.disable_prog_bar) + input_organism, has_sequence = read_anno_file(organism_name = args.organism_name, + filename=args.anno_file, + circular_contigs=[], + pseudo=args.use_pseudo) + + if not has_sequence: + if args.fasta_file: + retrieve_gene_sequences_from_fasta_file(input_organism, args.fasta_file) + else: + raise Exception("The gff/gbff provided did not have any sequence information, " + "Thus, we do not have the information we need to continue the projection.") + + elif args.fasta_file is not None: + input_organism = annotate_organism(org_name=args.organism_name, file_name = args.fasta_file, circular_contigs=[], tmpdir=args.tmpdir, + code = args.translation_table, norna=args.norna, kingdom = args.kingdom, + overlap=args.allow_overlap, procedure=args.prodigal_procedure) + + else: + raise Exception("At least one of --fasta_file or --anno_file must be 
given") + + + # load pangenome + + pangenome = Pangenome() + pangenome.add_file(args.pangenome) + check_pangenome_info(pangenome, need_annotations=True, need_families=True, disable_bar=args.disable_prog_bar) + + annotate_input_genome_with_pangenome(pangenome, input_organism=input_organism, output=output_dir, basename=args.basename, cpu=args.cpu, + no_defrag = args.no_defrag, identity = args.identity, coverage = args.coverage, tmpdir=args.tmpdir, + disable_bar=args.disable_prog_bar, translation_table = args.translation_table ) + + def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: """ Subparser to launch PPanGGOLiN in Command line @@ -52,7 +145,7 @@ def parser_projection(parser: argparse.ArgumentParser): required.add_argument('-p', '--pangenome', required=False, type=str, help="The pangenome.h5 file") required.add_argument('--organism_name', required=False, type=str, - help="Name of the organism whose genome is being projected onto the provided pangenome.") + help="Name of the input_organism whose genome is being projected onto the provided pangenome.") required.add_argument('--fasta_file', required=False, type=str, help="The filepath of the genomic sequence(s) in FASTA format for the projected genome. " @@ -63,20 +156,24 @@ def parser_projection(parser: argparse.ArgumentParser): "(Annotation file can be compressed with gzip)") # required.add_argument('--fasta', required=False, type=str, - # help="A tab-separated file listing the organism names, and the fasta filepath of its genomic " - # "sequence(s) (the fastas can be compressed with gzip). One line per organism.") + # help="A tab-separated file listing the input_organism names, and the fasta filepath of its genomic " + # "sequence(s) (the fastas can be compressed with gzip). 
One line per input_organism.") # required.add_argument('--anno', required=False, type=str, - # help="A tab-separated file listing the organism names, and the gff/gbff filepath of its " - # "annotations (the files can be compressed with gzip). One line per organism. " + # help="A tab-separated file listing the input_organism names, and the gff/gbff filepath of its " + # "annotations (the files can be compressed with gzip). One line per input_organism. " # "If this is provided, those annotations will be used.") - - annotate = parser.add_argument_group(title="Annotation arguments") - - annotate.add_argument('-o', '--output', required=False, type=str, + optional = parser.add_argument_group(title="Optional arguments") + optional.add_argument('-o', '--output', required=False, type=str, default="ppanggolin_output" + time.strftime("_DATE%Y-%m-%d_HOUR%H.%M.%S", time.localtime()) + "_PID" + str(os.getpid()), help="Output directory") + optional.add_argument("--basename", required=False, default="pangenome", help="basename for the output file") + optional.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus") + + annotate = parser.add_argument_group(title="Annotation arguments") + + annotate.add_argument('--allow_overlap', required=False, action='store_true', default=False, help="Use to not remove genes overlapping with RNA features.") annotate.add_argument("--norna", required=False, action="store_true", default=False, @@ -87,12 +184,13 @@ def parser_projection(parser: argparse.ArgumentParser): "to know which models to use for rRNA annotation.") annotate.add_argument("--translation_table", required=False, type=int, default=11, help="Translation table (genetic code) to use.") - annotate.add_argument("--basename", required=False, default="pangenome", help="basename for the output file") annotate.add_argument("--prodigal_procedure", required=False, type=str.lower, choices=["single", "meta"], default=None, help="Allow to force the 
prodigal procedure. " "If nothing given, PPanGGOLiN will decide in function of contig length") - + annotate.add_argument("--use_pseudo", required=False, action="store_true", + help="In the context of provided annotation, use this option to read pseudogenes. " + "(Default behavior is to ignore them)") cluster = parser.add_argument_group(title="Clustering arguments") cluster.add_argument('--no_defrag', required=False, action="store_true", @@ -102,5 +200,4 @@ def parser_projection(parser: argparse.ArgumentParser): help="min identity percentage threshold") cluster.add_argument('--coverage', required=False, type=float, default=0.8, help="min coverage percentage threshold") - cluster.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus") diff --git a/ppanggolin/utils.py b/ppanggolin/utils.py index 8ec780b4..af5511d4 100755 --- a/ppanggolin/utils.py +++ b/ppanggolin/utils.py @@ -25,7 +25,7 @@ from ppanggolin.geneFamily import GeneFamily # all input params that exists in ppanggolin -ALL_INPUT_PARAMS = ['fasta', 'anno', 'clusters', 'pangenome'] +ALL_INPUT_PARAMS = ['fasta', 'anno', 'clusters', 'pangenome', "fasta_file", "annot_file"] # all params that should be in the general_parameters section of the config file ALL_GENERAL_PARAMS = ['output', 'basename', 'rarefaction', 'no_flat_files', 'tmpdir', 'verbose', 'log', From 662185762abb0d2cb8fce8e97558b6c2eb0d130c Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Mon, 3 Jul 2023 17:16:19 +0200 Subject: [PATCH 025/173] correct/improve type and docstring --- ppanggolin/annotate/annotate.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/ppanggolin/annotate/annotate.py b/ppanggolin/annotate/annotate.py index 5491a9d4..1994679a 100644 --- a/ppanggolin/annotate/annotate.py +++ b/ppanggolin/annotate/annotate.py @@ -7,6 +7,7 @@ import logging import os import time +from typing import Tuple # installed libraries from tqdm import tqdm @@ -94,7 +95,7 @@ def 
create_gene(org: Organism, contig: Contig, gene_counter: int, rna_counter: i new_gene.fill_parents(org, contig) -def read_org_gbff(organism: str, gbff_file_path: str, circular_contigs: list, pseudo: bool = False) -> (Organism, bool): +def read_org_gbff(organism: str, gbff_file_path: str, circular_contigs: list, pseudo: bool = False) -> Tuple[Organism, bool]: """ Read a GBFF file and fills Organism, Contig and Genes objects based on information contained in this file @@ -238,7 +239,7 @@ def read_org_gbff(organism: str, gbff_file_path: str, circular_contigs: list, ps return org, True -def read_org_gff(organism: str, gff_file_path: str, circular_contigs, pseudo: bool = False) -> (Organism, bool): +def read_org_gff(organism: str, gff_file_path: str, circular_contigs, pseudo: bool = False) -> Tuple[Organism, bool]: """ Read annotation from GFF file @@ -382,7 +383,7 @@ def launch_read_anno(args: tuple) -> (Organism, bool): return read_anno_file(*args) -def read_anno_file(organism_name: str, filename: str, circular_contigs: list, pseudo: bool = False) -> (Organism, bool): +def read_anno_file(organism_name: str, filename: str, circular_contigs: list, pseudo: bool = False) -> Tuple[Organism, bool]: """ Read a GBFF file for one organism @@ -391,7 +392,7 @@ def read_anno_file(organism_name: str, filename: str, circular_contigs: list, ps :param circular_contigs: list of sequence in contig :param pseudo: allow to read pseudogène - :return: Annotated organism for pangenome + :return: Annotated organism for pangenome and true for sequence in file """ filetype = detect_filetype(filename) if filetype == "gff": From 93b2014ae1a22df57b88206c4b9e1a98d24d5df0 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 4 Jul 2023 10:06:49 +0200 Subject: [PATCH 026/173] add some docstring --- ppanggolin/RGP/genomicIsland.py | 49 ++++++++++++++++++++++++++++----- 1 file changed, 42 insertions(+), 7 deletions(-) diff --git a/ppanggolin/RGP/genomicIsland.py b/ppanggolin/RGP/genomicIsland.py index 
c03ba66f..ba17157b 100644 --- a/ppanggolin/RGP/genomicIsland.py +++ b/ppanggolin/RGP/genomicIsland.py @@ -189,27 +189,62 @@ def max_index_node(lst): return contig_regions -def compute_org_rgp(organism: Organism, multigenics: set, persistent_penalty: int = 3, variable_gain: int = 1, - min_length: int = 3000, min_score: int = 4, naming: str = "contig") -> set: +def compute_org_rgp( + organism: Organism, + multigenics: set, + persistent_penalty: int = 3, + variable_gain: int = 1, + min_length: int = 3000, + min_score: int = 4, + naming: str = "contig" +) -> set: + """ + Compute RGP on the given organism based on the provided parameters. + + :param organism: Organism object representing the organism. + :param multigenics: Set of multigenic regions. + :param persistent_penalty: Penalty for persistent multigenic regions (default: 3). + :param variable_gain: Gain for variable multigenic regions (default: 1). + :param min_length: Minimum length threshold for organization regions (default: 3000). + :param min_score: Minimum score threshold for organization regions (default: 4). + :param naming: Naming scheme for the regions (default: "contig"). + :return: Set of organization regions. + """ org_regions = set() for contig in organism.contigs: if len(contig.genes) != 0: # some contigs have no coding genes... # can definitely multiprocess this part, as not THAT much information is needed... matrix = init_matrices(contig, multigenics, persistent_penalty, variable_gain) - org_regions |= mk_regions(contig, matrix, multigenics, min_length, min_score, persistent_penalty, - variable_gain, naming=naming) + org_regions |= mk_regions( + contig, + matrix, + multigenics, + min_length, + min_score, + persistent_penalty, + variable_gain, + naming=naming + ) return org_regions -def naming_scheme(pangenome: Pangenome): +def naming_scheme(pangenome: Pangenome) -> str: + """ + Determine the naming scheme for the contigs in the pangenome. 
+ + :param pangenome: Pangenome object representing the pangenome. + :return: Naming scheme for the contigs ("contig" or "organism"). + """ contigsids = set() for org in pangenome.organisms: for contig in org.contigs: oldlen = len(contigsids) contigsids.add(contig.name) if oldlen == len(contigsids): - logging.getLogger().warning("You have contigs with identical identifiers in your assemblies. " - "identifiers will be supplemented with your provided organism names.") + logging.getLogger().warning( + "You have contigs with identical identifiers in your assemblies. " + "Identifiers will be supplemented with your provided organism names." + ) return "organism" return "contig" From 8f441f317e50fa44dd8197a789844213561c8c27 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 4 Jul 2023 11:56:26 +0200 Subject: [PATCH 027/173] correct docstring --- ppanggolin/RGP/genomicIsland.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ppanggolin/RGP/genomicIsland.py b/ppanggolin/RGP/genomicIsland.py index ba17157b..259b837c 100644 --- a/ppanggolin/RGP/genomicIsland.py +++ b/ppanggolin/RGP/genomicIsland.py @@ -202,11 +202,11 @@ def compute_org_rgp( Compute RGP on the given organism based on the provided parameters. :param organism: Organism object representing the organism. - :param multigenics: Set of multigenic regions. - :param persistent_penalty: Penalty for persistent multigenic regions (default: 3). - :param variable_gain: Gain for variable multigenic regions (default: 1). - :param min_length: Minimum length threshold for organization regions (default: 3000). - :param min_score: Minimum score threshold for organization regions (default: 4). + :param multigenics: multigenic persistent families of the pangenome graph. + :param persistent_penalty: Penalty for persistent multigenic families (default: 3). + :param variable_gain: Gain for variable multigenic families (default: 1). 
+ :param min_length: Minimum length threshold for regions (default: 3000). + :param min_score: Minimum score threshold for regions (default: 4). :param naming: Naming scheme for the regions (default: "contig"). :return: Set of organization regions. """ @@ -232,7 +232,7 @@ def naming_scheme(pangenome: Pangenome) -> str: """ Determine the naming scheme for the contigs in the pangenome. - :param pangenome: Pangenome object representing the pangenome. + :param pangenome: Pangenome object :return: Naming scheme for the contigs ("contig" or "organism"). """ contigsids = set() From 67477bb26cefdad9c570d66eb346373f07f1ca1e Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 4 Jul 2023 11:58:12 +0200 Subject: [PATCH 028/173] add possibility to give config step params in a wf fashion for projection --- ppanggolin/projection/projection.py | 118 +++++++++++++++++----------- ppanggolin/utility/utils.py | 4 + ppanggolin/utils.py | 38 ++++++++- 3 files changed, 114 insertions(+), 46 deletions(-) diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index 89b13362..bfc4d7ba 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -24,10 +24,11 @@ from ppanggolin.formats.writeSequences import write_gene_sequences_from_annotations from ppanggolin.formats import check_pangenome_info # from ppanggolin.formats import write_pangenome +from ppanggolin.RGP.genomicIsland import naming_scheme, compute_org_rgp +from ppanggolin.formats.readBinaries import retrieve_pangenome_parameters - -def annotate_input_genome_with_pangenome(pangenome, input_organism, output, basename, cpu, no_defrag, identity, coverage, tmpdir, - disable_bar, translation_table ): +def annotate_input_genes_with_pangenome_families(pangenome, input_organism, output, cpu, no_defrag, identity, coverage, tmpdir, + disable_bar, translation_table, ): """ @@ -46,6 +47,22 @@ def annotate_input_genome_with_pangenome(pangenome, input_organism, output, bas 
project_partition(seq2pan, seq_set, str(output)) + +def compute_RGP(pangenome, input_organism, dup_margin, persistent_penalty, variable_gain, min_length, min_score): + + ## Computing RGPs ## + logging.getLogger().info("Detecting multigenic families...") + multigenics = pangenome.get_multigenics(dup_margin) + + logging.getLogger().info("Compute Regions of Genomic Plasticity ...") + name_scheme = naming_scheme(pangenome) + + compute_org_rgp(input_organism, multigenics, persistent_penalty, variable_gain, min_length, + min_score, naming=name_scheme) + + + + def retrieve_gene_sequences_from_fasta_file(input_organism, fasta_file): """ Get gene sequences from fastas @@ -73,6 +90,8 @@ def retrieve_gene_sequences_from_fasta_file(input_organism, fasta_file): raise KeyError(msg) + + def launch(args: argparse.Namespace): """ Command launcher @@ -86,13 +105,19 @@ def launch(args: argparse.Namespace): # TODO check that the provided input_organism name is not found in pangenome # if so add a warning or error + # TODO some params are no keep in pangenome... like use_pseudo. what to do? 
+ logging.getLogger().info('Retrieving pangenome parameters from the provided pangenome file.') + + step_to_params = retrieve_pangenome_parameters(args.pangenome) + annotation_params_str = " ".join([f"{param}={value}" for param, value in step_to_params["annotation"].items()]) + logging.getLogger().debug(f'annotation params {annotation_params_str}' ) - if args.anno_file is not None: + if args.annot_file is not None: # read_annotations(pangenome, args.anno, cpu=args.cpu, pseudo=args.use_pseudo, disable_bar=args.disable_prog_bar) input_organism, has_sequence = read_anno_file(organism_name = args.organism_name, - filename=args.anno_file, + filename=args.annot_file, circular_contigs=[], - pseudo=args.use_pseudo) + pseudo=False) if not has_sequence: if args.fasta_file: @@ -103,8 +128,8 @@ def launch(args: argparse.Namespace): elif args.fasta_file is not None: input_organism = annotate_organism(org_name=args.organism_name, file_name = args.fasta_file, circular_contigs=[], tmpdir=args.tmpdir, - code = args.translation_table, norna=args.norna, kingdom = args.kingdom, - overlap=args.allow_overlap, procedure=args.prodigal_procedure) + code = args.annotate.translation_table, norna=args.annotate.norna, kingdom = args.annotate.kingdom, + overlap=args.annotate.allow_overlap, procedure=args.annotate.prodigal_procedure) else: raise Exception("At least one of --fasta_file or --anno_file must be given") @@ -116,10 +141,14 @@ def launch(args: argparse.Namespace): pangenome.add_file(args.pangenome) check_pangenome_info(pangenome, need_annotations=True, need_families=True, disable_bar=args.disable_prog_bar) - annotate_input_genome_with_pangenome(pangenome, input_organism=input_organism, output=output_dir, basename=args.basename, cpu=args.cpu, - no_defrag = args.no_defrag, identity = args.identity, coverage = args.coverage, tmpdir=args.tmpdir, - disable_bar=args.disable_prog_bar, translation_table = args.translation_table ) + annotate_input_genes_with_pangenome_families(pangenome, 
input_organism=input_organism, output=output_dir, cpu=args.cluster.cpu, + no_defrag = args.cluster.no_defrag, identity = args.cluster.identity, coverage = args.cluster.coverage, tmpdir=args.tmpdir, + disable_bar=args.disable_prog_bar, translation_table = args.annotate.translation_table ) + + # compute_RGP(pangenome, input_organism, dup_margin=0.05, persistent_penalty=3, variable_gain=1, min_length=3000, min_score=4) + + def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: """ @@ -145,13 +174,13 @@ def parser_projection(parser: argparse.ArgumentParser): required.add_argument('-p', '--pangenome', required=False, type=str, help="The pangenome.h5 file") required.add_argument('--organism_name', required=False, type=str, - help="Name of the input_organism whose genome is being projected onto the provided pangenome.") + help="Name of the input_organism whose genome is being annotated with the provided pangenome.") required.add_argument('--fasta_file', required=False, type=str, help="The filepath of the genomic sequence(s) in FASTA format for the projected genome. " "(Fasta file can be compressed with gzip)") - required.add_argument('--anno_file', required=False, type=str, + required.add_argument('--annot_file', required=False, type=str, help="The filepath of the annotations in GFF/GBFF format for the projected genome. 
" "(Annotation file can be compressed with gzip)") @@ -168,36 +197,35 @@ def parser_projection(parser: argparse.ArgumentParser): default="ppanggolin_output" + time.strftime("_DATE%Y-%m-%d_HOUR%H.%M.%S", time.localtime()) + "_PID" + str(os.getpid()), help="Output directory") - optional.add_argument("--basename", required=False, default="pangenome", help="basename for the output file") - optional.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus") - - annotate = parser.add_argument_group(title="Annotation arguments") - - - annotate.add_argument('--allow_overlap', required=False, action='store_true', default=False, - help="Use to not remove genes overlapping with RNA features.") - annotate.add_argument("--norna", required=False, action="store_true", default=False, - help="Use to avoid annotating RNA features.") - annotate.add_argument("--kingdom", required=False, type=str.lower, default="bacteria", - choices=["bacteria", "archaea"], - help="Kingdom to which the prokaryota belongs to, " - "to know which models to use for rRNA annotation.") - annotate.add_argument("--translation_table", required=False, type=int, default=11, - help="Translation table (genetic code) to use.") - - annotate.add_argument("--prodigal_procedure", required=False, type=str.lower, choices=["single", "meta"], - default=None, help="Allow to force the prodigal procedure. " - "If nothing given, PPanGGOLiN will decide in function of contig length") - annotate.add_argument("--use_pseudo", required=False, action="store_true", - help="In the context of provided annotation, use this option to read pseudogenes. 
" - "(Default behavior is to ignore them)") - - cluster = parser.add_argument_group(title="Clustering arguments") - cluster.add_argument('--no_defrag', required=False, action="store_true", - help="DO NOT Realign gene families to link fragments with" - "their non-fragmented gene family.") - cluster.add_argument('--identity', required=False, type=float, default=0.5, - help="min identity percentage threshold") - cluster.add_argument('--coverage', required=False, type=float, default=0.8, - help="min coverage percentage threshold") + # optional.add_argument("--basename", required=False, default="pangenome", help="basename for the output file") + + # annotate = parser.add_argument_group(title="Annotation arguments") + + + # annotate.add_argument('--allow_overlap', required=False, action='store_true', default=False, + # help="Use to not remove genes overlapping with RNA features.") + # annotate.add_argument("--norna", required=False, action="store_true", default=False, + # help="Use to avoid annotating RNA features.") + # annotate.add_argument("--kingdom", required=False, type=str.lower, default="bacteria", + # choices=["bacteria", "archaea"], + # help="Kingdom to which the prokaryota belongs to, " + # "to know which models to use for rRNA annotation.") + # annotate.add_argument("--translation_table", required=False, type=int, default=11, + # help="Translation table (genetic code) to use.") + + # annotate.add_argument("--prodigal_procedure", required=False, type=str.lower, choices=["single", "meta"], + # default=None, help="Allow to force the prodigal procedure. " + # "If nothing given, PPanGGOLiN will decide in function of contig length") + # annotate.add_argument("--use_pseudo", required=False, action="store_true", + # help="In the context of provided annotation, use this option to read pseudogenes. 
" + # "(Default behavior is to ignore them)") + + # cluster = parser.add_argument_group(title="Clustering arguments") + # cluster.add_argument('--no_defrag', required=False, action="store_true", + # help="DO NOT Realign gene families to link fragments with" + # "their non-fragmented gene family.") + # cluster.add_argument('--identity', required=False, type=float, default=0.5, + # help="min identity percentage threshold") + # cluster.add_argument('--coverage', required=False, type=float, default=0.8, + # help="min coverage percentage threshold") diff --git a/ppanggolin/utility/utils.py b/ppanggolin/utility/utils.py index c49595ba..c217a1f4 100644 --- a/ppanggolin/utility/utils.py +++ b/ppanggolin/utility/utils.py @@ -150,6 +150,10 @@ def launch_default_config(args: argparse.Namespace): # it is clearer if the order of the subcommand is conserved in wf config file commands = [initial_command] + [sub_cmd for sub_cmd in ALL_WORKFLOW_DEPENDENCIES if sub_cmd in workflow_dependencies] + elif initial_command == "projection": + commands = [initial_command] + [sub_cmd for sub_cmd in ALL_WORKFLOW_DEPENDENCIES if + sub_cmd not in ["write", "draw"]] + else: commands = [initial_command] diff --git a/ppanggolin/utils.py b/ppanggolin/utils.py index af5511d4..78a91c12 100755 --- a/ppanggolin/utils.py +++ b/ppanggolin/utils.py @@ -25,7 +25,8 @@ from ppanggolin.geneFamily import GeneFamily # all input params that exists in ppanggolin -ALL_INPUT_PARAMS = ['fasta', 'anno', 'clusters', 'pangenome', "fasta_file", "annot_file"] +ALL_INPUT_PARAMS = ['fasta', 'anno', 'clusters', 'pangenome', + "fasta_file", "annot_file", "organism_name"] # the last three params is for projection cmd # all params that should be in the general_parameters section of the config file ALL_GENERAL_PARAMS = ['output', 'basename', 'rarefaction', 'no_flat_files', 'tmpdir', 'verbose', 'log', @@ -683,6 +684,41 @@ def manage_cli_and_config_args(subcommand: str, config_file: str, subcommand_to_ # Add args namespace of the 
step to the inital args namespace setattr(args, workflow_step, step_args) + # manage projection step parameters + elif subcommand == "projection" : + for projection_step in ["annotate", "cluster", "graph", "partition", "rarefaction", "rgp", "spot", "module"]: + + logging.getLogger().debug(f'Parsing {projection_step} arguments in config file.') + step_subparser = subcommand_to_subparser[projection_step] + + default_step_args = get_default_args(projection_step, step_subparser, unwanted_args=all_unspecific_params) + + # remove general args + all_param_names = {arg_name for arg_name in dir(default_step_args) if not arg_name.startswith('_')} + specific_step_params = {param_name for param_name in all_param_names if + param_name not in all_unspecific_params} + config_step_args = get_config_args(projection_step, step_subparser, config, projection_step, + specific_step_params, strict_config_check=True) + + step_args = overwrite_args(default_step_args, config_step_args, cli_args) + + step_params_that_differ = get_args_that_differe_from_default(default_step_args, step_args) + + if step_params_that_differ: + step_params_that_differ_str = ', '.join([f'{p}={v}' for p, v in step_params_that_differ.items()]) + logging.getLogger().debug( + f"{len(step_params_that_differ)} {projection_step} parameters have a non-default value: {step_params_that_differ_str}") + + # add step name to differentiate the params + step_params_that_differ = {f'{projection_step}:{param}': value for param, value in + step_params_that_differ.items()} + + params_that_differ.update(step_params_that_differ) + + # Add args namespace of the step to the inital args namespace + setattr(args, projection_step, step_args) + + if params_that_differ: logging.getLogger().info(f'{len(params_that_differ)} parameters have a non-default value.') From a2175a5fe6587f3598ee1b88a2c880e947d4a53a Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 4 Jul 2023 16:03:12 +0200 Subject: [PATCH 029/173] improve clarity of variable and 
function names --- ppanggolin/align/alignOnPang.py | 16 ++++++++-------- ppanggolin/context/searchGeneContext.py | 4 ++-- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/ppanggolin/align/alignOnPang.py b/ppanggolin/align/alignOnPang.py index 96941503..29a9326b 100644 --- a/ppanggolin/align/alignOnPang.py +++ b/ppanggolin/align/alignOnPang.py @@ -8,7 +8,7 @@ import subprocess import argparse from collections import defaultdict -from typing import Tuple, Set, Dict +from typing import Tuple, Set, Dict, Iterator # local libraries from ppanggolin.formats import check_pangenome_info @@ -149,11 +149,11 @@ def write_gene_fam_sequences(pangenome: Pangenome, file_obj: TextIOWrapper, add: # file_obj.flush() -def project_partition(seq_to_pang: Dict[str, GeneFamily], seq_set: Set[str], output: str) -> str: +def project_and_write_partition(seqid_to_gene_family: Dict[str, GeneFamily], seq_set: Set[str], output: str) -> str: """ - Project the partition of each sequence from the input file + Project the partition of each sequence from the input file and write them in a file - :param seq_to_pang: dictionnary which link sequence and pangenome + :param seqid_to_gene_family: dictionnary which link sequence and pangenome gene family :param seq_set: input sequences :param output: Path of the output directory @@ -162,10 +162,10 @@ def project_partition(seq_to_pang: Dict[str, GeneFamily], seq_set: Set[str], out partition_proj = output + "/sequences_partition_projection.tsv" with open(partition_proj, "w") as partProjFile: - for input_seq, pangFam in seq_to_pang.items(): + for input_seq, pangFam in seqid_to_gene_family.items(): partProjFile.write(input_seq + "\t" + pangFam.named_partition + "\n") - for remainingSeq in (seq_to_pang.keys() & seq_set): - partProjFile.write(remainingSeq + "\tcloud\n") # if there is no hit, it's going to be cloud genes. 
+ for remainingSeq in (seqid_to_gene_family.keys() & seq_set): + partProjFile.write(remainingSeq + "\tCloud\n") # if there is no hit, it's going to be cloud genes. return partition_proj @@ -365,7 +365,7 @@ def align(pangenome: Pangenome, sequence_file: str, output: str, tmpdir: str, id if getinfo or draw_related: # TODO Add getinfo to function and remove if get_seq_info(seq2pang, pangenome, output, draw_related, disable_bar=disable_bar) - part_proj = project_partition(seq2pang, seq_set, output) # write the partition assignation only + part_proj = project_and_write_partition(seq2pang, seq_set, output) # write the partition assignation only logging.getLogger().info(f"sequences partition projection : '{part_proj}'") logging.getLogger().info(f"{len(seq2pang)} sequences over {len(seq_set)} have at least one hit in the pangenome.") logging.getLogger().info(f"Blast-tab file of the alignment : '{align_file}'") diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index e8db598e..c0ab4146 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -17,7 +17,7 @@ from ppanggolin.genome import Gene, Contig from ppanggolin.utils import mk_outdir, restricted_float, add_gene, connected_components from ppanggolin.pangenome import Pangenome -from ppanggolin.align.alignOnPang import get_seq2pang, project_partition +from ppanggolin.align.alignOnPang import get_seq2pang, project_and_write_partition from ppanggolin.region import GeneContext @@ -55,7 +55,7 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: new_tmpdir = tempfile.TemporaryDirectory(dir=tmpdir) seq_set, _, seq2pan = get_seq2pang(pangenome, sequences, output, new_tmpdir, cpu, no_defrag, identity, coverage) - project_partition(seq2pan, seq_set, output) + project_and_write_partition(seq2pan, seq_set, output) new_tmpdir.cleanup() for k, v in seq2pan.items(): gene_families[v.name] = v From 
e48730df3f6442701cee0789fdb5caf0142f1069 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 4 Jul 2023 16:04:40 +0200 Subject: [PATCH 030/173] improve tqdm bar to a have total of genes --- ppanggolin/formats/writeSequences.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py index 9d601d11..a881b749 100644 --- a/ppanggolin/formats/writeSequences.py +++ b/ppanggolin/formats/writeSequences.py @@ -40,7 +40,7 @@ def write_gene_sequences_from_annotations(pangenome: Pangenome, file_obj: TextIO if list_cds is None: list_cds = pangenome.genes logging.getLogger().info("Writing all of the CDS sequences...") - for gene in tqdm(list_cds, unit="gene", disable=disable_bar): + for gene in tqdm(list_cds, unit="gene", total=pangenome.number_of_genes(), disable=disable_bar): if gene.type == "CDS": counter += 1 file_obj.write('>' + add + gene.ID + "\n") From e825dd1fe74c23353a0efded4518dbb67c424a8d Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 4 Jul 2023 16:05:25 +0200 Subject: [PATCH 031/173] improve docstring and add tqdm bar --- ppanggolin/RGP/genomicIsland.py | 35 ++++++++++++++------------------- 1 file changed, 15 insertions(+), 20 deletions(-) diff --git a/ppanggolin/RGP/genomicIsland.py b/ppanggolin/RGP/genomicIsland.py index 259b837c..940829ef 100644 --- a/ppanggolin/RGP/genomicIsland.py +++ b/ppanggolin/RGP/genomicIsland.py @@ -189,29 +189,24 @@ def max_index_node(lst): return contig_regions -def compute_org_rgp( - organism: Organism, - multigenics: set, - persistent_penalty: int = 3, - variable_gain: int = 1, - min_length: int = 3000, - min_score: int = 4, - naming: str = "contig" -) -> set: +def compute_org_rgp( organism: Organism, multigenics: set, + persistent_penalty: int = 3, variable_gain: int = 1, min_length: int = 3000, min_score: int = 4, + naming: str = "contig", disable_bar: bool = True ) -> set: """ - Compute RGP on the given organism based on the provided 
parameters. - - :param organism: Organism object representing the organism. - :param multigenics: multigenic persistent families of the pangenome graph. - :param persistent_penalty: Penalty for persistent multigenic families (default: 3). - :param variable_gain: Gain for variable multigenic families (default: 1). - :param min_length: Minimum length threshold for regions (default: 3000). - :param min_score: Minimum score threshold for regions (default: 4). - :param naming: Naming scheme for the regions (default: "contig"). - :return: Set of organization regions. + Compute regions of genomic plasticity (RGP) on the given organism based on the provided parameters. + + :param organism: The Organism object representing the organism. + :param multigenics: A set of multigenic persistent families of the pangenome graph. + :param persistent_penalty: Penalty score to apply to persistent multigenic families (default: 3). + :param variable_gain: Gain score to apply to variable multigenic families (default: 1). + :param min_length: Minimum length threshold (in base pairs) for the regions to be considered RGP (default: 3000). + :param min_score: Minimum score threshold for considering a region as RGP (default: 4). + :param naming: Naming scheme for the regions, either "contig" or "organism" (default: "contig"). + :param disable_bar: Whether to disable the progress bar. It is recommended to disable it when calling this function in a loop on multiple organisms (default: True). + :return: A set of organization regions representing the predicted RGPs. """ org_regions = set() - for contig in organism.contigs: + for contig in tqdm(organism.contigs, total=len(organism.contigs), unit="contig", disable=disable_bar): if len(contig.genes) != 0: # some contigs have no coding genes... # can definitely multiprocess this part, as not THAT much information is needed... 
matrix = init_matrices(contig, multigenics, persistent_penalty, variable_gain) From f18fa5fa42caf9773cdff6ea0c17a57ab5b05e3b Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 4 Jul 2023 16:46:37 +0200 Subject: [PATCH 032/173] predict rgps in input genome --- ppanggolin/projection/projection.py | 127 ++++++++++++++++++++++------ 1 file changed, 101 insertions(+), 26 deletions(-) diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index bfc4d7ba..9e6f84a7 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -9,7 +9,7 @@ import time from pathlib import Path import tempfile - +from typing import Tuple, Set, Dict, Iterator # installed libraries from tqdm import tqdm @@ -18,14 +18,19 @@ from ppanggolin.annotate.synta import annotate_organism, read_fasta, get_dna_sequence from ppanggolin.annotate.annotate import read_anno_file from ppanggolin.pangenome import Pangenome +from ppanggolin.cluster.cluster import infer_singletons # from ppanggolin.genome import input_organism, Gene, RNA, Contig -from ppanggolin.utils import read_compressed_or_not, mk_file_name, min_one, restricted_float, mk_outdir -from ppanggolin.align.alignOnPang import get_seq2pang, project_partition +from ppanggolin.utils import read_compressed_or_not, write_compressed_or_not, mk_file_name, min_one, restricted_float, mk_outdir +from ppanggolin.align.alignOnPang import get_seq2pang, project_and_write_partition from ppanggolin.formats.writeSequences import write_gene_sequences_from_annotations from ppanggolin.formats import check_pangenome_info # from ppanggolin.formats import write_pangenome from ppanggolin.RGP.genomicIsland import naming_scheme, compute_org_rgp -from ppanggolin.formats.readBinaries import retrieve_pangenome_parameters +# from ppanggolin.formats.readBinaries import retrieve_pangenome_parameters +from ppanggolin.genome import Organism, Gene, RNA, Contig +from ppanggolin.geneFamily import GeneFamily +from 
ppanggolin.region import Region +from ppanggolin.formats.writeFlat import write_flat_files def annotate_input_genes_with_pangenome_families(pangenome, input_organism, output, cpu, no_defrag, identity, coverage, tmpdir, disable_bar, translation_table, ): @@ -42,27 +47,77 @@ def annotate_input_genes_with_pangenome_families(pangenome, input_organism, out # get corresponding gene families new_tmpdir = tempfile.TemporaryDirectory(dir=tmpdir, prefix="seq_to_pang_tmpdir_") - seq_set, _, seq2pan = get_seq2pang(pangenome, str(seq_fasta_file), str(output), new_tmpdir, cpu, no_defrag, identity=identity, + seq_set, _, seqid_to_gene_family = get_seq2pang(pangenome, str(seq_fasta_file), str(output), new_tmpdir, cpu, no_defrag, identity=identity, coverage=coverage, is_protein=False, translation_table=translation_table) - project_partition(seq2pan, seq_set, str(output)) + # this function only write the seqid and partition associated in a file + project_and_write_partition(seqid_to_gene_family, seq_set, str(output)) + # Add gene of the input organism in the associated gene family + # when a gene is not associated with any gene family, a new family is created. 
+ lonely_gene = 0 + for gene in input_organism.genes: + try: + gene_family = seqid_to_gene_family[gene.ID] + gene_family.add_gene(gene) -def compute_RGP(pangenome, input_organism, dup_margin, persistent_penalty, variable_gain, min_length, min_score): - - ## Computing RGPs ## + except KeyError: + # add a new gene family + new_gene_family = pangenome.add_gene_family(gene.ID) + new_gene_family.add_gene(gene) + new_gene_family.add_partition("Cloud") + lonely_gene += 1 + + logging.getLogger().info(f"The input organisms have {lonely_gene}/{input_organism.number_of_genes()} " + "genes that do not cluster with any of the gene families of the pangenome.") + + +def compute_RGP(pangenome: Pangenome, input_organism: Organism, persistent_penalty: int, variable_gain: int, + min_length: int, min_score: int, dup_margin: float, + disable_bar: bool) -> None: + """ + Compute Regions of Genomic Plasticity (RGP) for the given pangenome and input organism. + + :param pangenome: The pangenome object. + :param input_organism: The input organism for which to compute RGPs. + :param persistent_penalty: Penalty score to apply to persistent genes. + :param variable_gain: Gain score to apply to variable genes. + :param min_length: Minimum length (bp) of a region to be considered as RGP. + :param min_score: Minimal score required for considering a region as RGP. + :param dup_margin: Minimum ratio of organisms in which a family must have multiple genes to be considered duplicated. + :param disable_bar: Flag to disable the progress bar. 
+ + :return: None + """ logging.getLogger().info("Detecting multigenic families...") multigenics = pangenome.get_multigenics(dup_margin) - logging.getLogger().info("Compute Regions of Genomic Plasticity ...") + logging.getLogger().info("Computing Regions of Genomic Plasticity...") name_scheme = naming_scheme(pangenome) - compute_org_rgp(input_organism, multigenics, persistent_penalty, variable_gain, min_length, - min_score, naming=name_scheme) - + rgps = compute_org_rgp(input_organism, multigenics, persistent_penalty, variable_gain, min_length, + min_score, naming=name_scheme, disable_bar=disable_bar) + print("Found RGPS ", len(rgps)) + return rgps +def write_predicted_regions(regions : Set[Region], output:Path, compress=False): + """ + Write the file providing information about RGP content + + :param output: Path to output directory + :param compress: Compress the file in .gz + """ + fname = output / "plastic_regions.tsv" + with write_compressed_or_not(fname, compress) as tab: + tab.write("region\torganism\tcontig\tstart\tstop\tgenes\tcontigBorder\twholeContig\n") + regions = sorted(regions, key=lambda x: (x.organism.name, x.contig.name, x.start)) + for region in regions: + tab.write('\t'.join(map(str, [region.name, region.organism, region.contig, region.start, region.stop, + len(region.genes), region.is_contig_border, region.is_whole_contig])) + "\n") + + def retrieve_gene_sequences_from_fasta_file(input_organism, fasta_file): """ Get gene sequences from fastas @@ -99,6 +154,7 @@ def launch(args: argparse.Namespace): :param args: All arguments provide by user """ + output_dir = Path(args.output) mk_outdir(output_dir, args.force) @@ -108,9 +164,6 @@ def launch(args: argparse.Namespace): # TODO some params are no keep in pangenome... like use_pseudo. what to do? 
logging.getLogger().info('Retrieving pangenome parameters from the provided pangenome file.') - step_to_params = retrieve_pangenome_parameters(args.pangenome) - annotation_params_str = " ".join([f"{param}={value}" for param, value in step_to_params["annotation"].items()]) - logging.getLogger().debug(f'annotation params {annotation_params_str}' ) if args.annot_file is not None: # read_annotations(pangenome, args.anno, cpu=args.cpu, pseudo=args.use_pseudo, disable_bar=args.disable_prog_bar) @@ -139,17 +192,40 @@ def launch(args: argparse.Namespace): pangenome = Pangenome() pangenome.add_file(args.pangenome) - check_pangenome_info(pangenome, need_annotations=True, need_families=True, disable_bar=args.disable_prog_bar) + + check_pangenome_info(pangenome, need_annotations=True, need_families=True, disable_bar=args.disable_prog_bar, need_rgp=args.predict_rgp) + + # Add input organism in pangenome. This temporary as pangenome is not going to be written. + pangenome.add_organism(input_organism) annotate_input_genes_with_pangenome_families(pangenome, input_organism=input_organism, output=output_dir, cpu=args.cluster.cpu, no_defrag = args.cluster.no_defrag, identity = args.cluster.identity, coverage = args.cluster.coverage, tmpdir=args.tmpdir, disable_bar=args.disable_prog_bar, translation_table = args.annotate.translation_table ) - # compute_RGP(pangenome, input_organism, dup_margin=0.05, persistent_penalty=3, variable_gain=1, min_length=3000, min_score=4) + if args.predict_rgp: + + rgps = compute_RGP(pangenome, input_organism, persistent_penalty=args.rgp.persistent_penalty, variable_gain=args.rgp.variable_gain, + min_length=args.rgp.min_length, min_score=args.rgp.min_score, dup_margin=args.rgp.dup_margin, + disable_bar=args.disable_prog_bar) + + write_predicted_regions(rgps, output=output_dir) + + + # write_flat_files_for_input_genome(input_organism) + +# def write_flat_files_for_input_genome(input_organism): + +# # create a pangenome object with only the input organism +# 
pangenome = Pangenome() +# pangenome.add_organism(input_organism) + +# write_flat_files(pangenome, regions=True) + + def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: """ Subparser to launch PPanGGOLiN in Command line @@ -184,19 +260,18 @@ def parser_projection(parser: argparse.ArgumentParser): help="The filepath of the annotations in GFF/GBFF format for the projected genome. " "(Annotation file can be compressed with gzip)") - # required.add_argument('--fasta', required=False, type=str, - # help="A tab-separated file listing the input_organism names, and the fasta filepath of its genomic " - # "sequence(s) (the fastas can be compressed with gzip). One line per input_organism.") - - # required.add_argument('--anno', required=False, type=str, - # help="A tab-separated file listing the input_organism names, and the gff/gbff filepath of its " - # "annotations (the files can be compressed with gzip). One line per input_organism. " - # "If this is provided, those annotations will be used.") + + optional = parser.add_argument_group(title="Optional arguments") optional.add_argument('-o', '--output', required=False, type=str, default="ppanggolin_output" + time.strftime("_DATE%Y-%m-%d_HOUR%H.%M.%S", time.localtime()) + "_PID" + str(os.getpid()), help="Output directory") + + optional.add_argument('--predict_rgp', required=False, action='store_true', default=False, + help="Predict rgp on the input genome.") + optional.add_argument('--project_modules', required=False, action='store_true', default=False, + help="Predict rgp on the input genome.") # optional.add_argument("--basename", required=False, default="pangenome", help="basename for the output file") # annotate = parser.add_argument_group(title="Annotation arguments") From e3993703377048c7da9c58cd39a5b2ad08597054 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 4 Jul 2023 17:23:54 +0200 Subject: [PATCH 033/173] project module onto input genomes --- ppanggolin/projection/projection.py | 
47 ++++++++++++++++++++++++++--- 1 file changed, 43 insertions(+), 4 deletions(-) diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index 9e6f84a7..74409686 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -98,7 +98,8 @@ def compute_RGP(pangenome: Pangenome, input_organism: Organism, persistent_penal rgps = compute_org_rgp(input_organism, multigenics, persistent_penalty, variable_gain, min_length, min_score, naming=name_scheme, disable_bar=disable_bar) - print("Found RGPS ", len(rgps)) + # print("Found RGPS ", len(rgps)) + # TODO add number of RGP found.. return rgps @@ -193,7 +194,9 @@ def launch(args: argparse.Namespace): pangenome = Pangenome() pangenome.add_file(args.pangenome) - check_pangenome_info(pangenome, need_annotations=True, need_families=True, disable_bar=args.disable_prog_bar, need_rgp=args.predict_rgp) + check_pangenome_info(pangenome, need_annotations=True, need_families=True, disable_bar=args.disable_prog_bar, + need_rgp=args.predict_rgp, need_modules=args.project_modules, + need_spots=args.project_spots) # Add input organism in pangenome. This temporary as pangenome is not going to be written. 
pangenome.add_organism(input_organism) @@ -211,9 +214,43 @@ def launch(args: argparse.Namespace): write_predicted_regions(rgps, output=output_dir) - + if args.project_modules: + write_projected_modules_to_input_organism(pangenome, input_organism, output_dir) + if args.project_spots: + pass + # write_flat_files_for_input_genome(input_organism) +def write_projected_modules_to_input_organism(pangenome, input_organism, output, compress=False): + """Write a tsv file providing association between modules and organisms + + :param output: Path to output directory + :param compress: Compress the file in .gz + """ + + # TODO ad logging with number of module associated with the input orga + output_file = output / "modules_in_input_organism.tsv" + + logging.getLogger().info("Writing modules to organisms associations...") + + input_organism_families = input_organism.families + + with write_compressed_or_not(output_file, compress) as fout: + fout.write("module_id\torganism\tcompletion\n") + + for mod in pangenome.modules: + mod_orgs = set() + + module_in_input_organism = any((fam in input_organism_families for fam in mod.families)) + + if module_in_input_organism: + + completion = round(len(input_organism.families & mod.families) / len(mod.families), 2) + fout.write(f"module_{mod.ID}\t{input_organism.name}\t{completion}\n") + + logging.getLogger().info( + f"Writing projected modules to input organism : '{output_file}'") + # def write_flat_files_for_input_genome(input_organism): @@ -271,7 +308,9 @@ def parser_projection(parser: argparse.ArgumentParser): optional.add_argument('--predict_rgp', required=False, action='store_true', default=False, help="Predict rgp on the input genome.") optional.add_argument('--project_modules', required=False, action='store_true', default=False, - help="Predict rgp on the input genome.") + help="Project pangenome modules to the input genome.") + optional.add_argument('--project_spots', required=False, action='store_true', default=False, + 
help="Project pangenome spots to the input genome.") # optional.add_argument("--basename", required=False, default="pangenome", help="basename for the output file") # annotate = parser.add_argument_group(title="Annotation arguments") From c6d395750e14fa444ddcd21bd1ca0bdbc800cb24 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 4 Jul 2023 17:28:59 +0200 Subject: [PATCH 034/173] correct typo in write_gene_sequences_from_annotations --- ppanggolin/formats/writeSequences.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py index a881b749..1527dd53 100644 --- a/ppanggolin/formats/writeSequences.py +++ b/ppanggolin/formats/writeSequences.py @@ -40,7 +40,7 @@ def write_gene_sequences_from_annotations(pangenome: Pangenome, file_obj: TextIO if list_cds is None: list_cds = pangenome.genes logging.getLogger().info("Writing all of the CDS sequences...") - for gene in tqdm(list_cds, unit="gene", total=pangenome.number_of_genes(), disable=disable_bar): + for gene in tqdm(list_cds, unit="gene", total=pangenome.number_of_gene(), disable=disable_bar): if gene.type == "CDS": counter += 1 file_obj.write('>' + add + gene.ID + "\n") From dab2f53f79c2ee8ca686ff1c0913900a6f722515 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 5 Jul 2023 09:57:45 +0200 Subject: [PATCH 035/173] improve write_gene_sequences_from_annotations fct --- ppanggolin/cluster/cluster.py | 2 +- ppanggolin/formats/writeSequences.py | 37 +++++++++++----------------- ppanggolin/projection/projection.py | 3 +-- 3 files changed, 16 insertions(+), 26 deletions(-) diff --git a/ppanggolin/cluster/cluster.py b/ppanggolin/cluster/cluster.py index 3f5b3365..d9036f1f 100644 --- a/ppanggolin/cluster/cluster.py +++ b/ppanggolin/cluster/cluster.py @@ -53,7 +53,7 @@ def check_pangenome_for_clustering(pangenome: Pangenome, tmp_file: io.TextIO, fo check_pangenome_former_clustering(pangenome, force) if 
pangenome.status["geneSequences"] in ["Computed", "Loaded"]: # we append the gene ids by 'ppanggolin' to avoid crashes from mmseqs when sequence IDs are only numeric. - write_gene_sequences_from_annotations(pangenome, tmp_file, add="ppanggolin_", disable_bar=disable_bar) + write_gene_sequences_from_annotations(pangenome.genes, tmp_file, add="ppanggolin_", disable_bar=disable_bar) elif pangenome.status["geneSequences"] == "inFile": get_gene_sequences_from_file(pangenome.file, tmp_file, add="ppanggolin_", disable_bar=disable_bar) # write CDS sequences to the tmpFile diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py index 1527dd53..ec679190 100644 --- a/ppanggolin/formats/writeSequences.py +++ b/ppanggolin/formats/writeSequences.py @@ -20,34 +20,25 @@ "'core', 'module_X' with X being a module id." -def write_gene_sequences_from_annotations(pangenome: Pangenome, file_obj: TextIO, list_cds: list = None, add: str = '', - seq_attr_to_write: str = "dna" , +def write_gene_sequences_from_annotations(genes_to_write: Pangenome, file_obj: TextIO, add: str = '', disable_bar: bool = False): """ - Writes the CDS sequences given through list_CDS of the Pangenome object to a tmpFile object, - and adds the str provided through add in front of it. - Loads the sequences from previously computed or loaded annotations - - :param pangenome: Pangenome object with gene families sequences - :param file_obj: Output file to write sequences - :param list_cds: Selected genes - :param add: Add prefix to gene ID - :param disable_bar: Disable progress bar + Writes the CDS sequences to a File object, + and adds the string provided through `add` in front of it. + Loads the sequences from previously computed or loaded annotations. + + :param genes_to_write: Genes to write. + :param file_obj: Output file to write sequences. + :param add: Add prefix to gene ID. + :param disable_bar: Disable progress bar. 
""" - assert seq_attr_to_write in ['dna', "protein"] - - counter = 0 - if list_cds is None: - list_cds = pangenome.genes - logging.getLogger().info("Writing all of the CDS sequences...") - for gene in tqdm(list_cds, unit="gene", total=pangenome.number_of_gene(), disable=disable_bar): + logging.getLogger().info("Writing CDS sequences...") + for gene in tqdm(genes_to_write, unit="gene", disable=disable_bar): if gene.type == "CDS": - counter += 1 - file_obj.write('>' + add + gene.ID + "\n") - file_obj.write(getattr(gene, seq_attr_to_write) + "\n") + file_obj.write(f'>{add}{gene.ID}\n') + file_obj.write(f'{gene.dna}\n') file_obj.flush() - def write_gene_sequences(pangenome: Pangenome, output: str, genes: str, soft_core: float = 0.95, compress: bool = False, disable_bar: bool = False): """ @@ -75,7 +66,7 @@ def write_gene_sequences(pangenome: Pangenome, output: str, genes: str, soft_cor get_gene_sequences_from_file(pangenome.file, fasta, set([gene.ID for gene in genes_to_write]), disable_bar=disable_bar) elif pangenome.status["geneSequences"] in ["Computed", "Loaded"]: - write_gene_sequences_from_annotations(pangenome, fasta, genes_to_write, disable_bar=disable_bar) + write_gene_sequences_from_annotations(genes_to_write, fasta, disable_bar=disable_bar) else: # this should never happen if the pangenome has been properly checked before launching this function. 
raise Exception("The pangenome does not include gene sequences") diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index 74409686..3e759e6c 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -42,9 +42,8 @@ def annotate_input_genes_with_pangenome_families(pangenome, input_organism, out seq_fasta_file = output / f"{input_organism.name}.fasta" with open(seq_fasta_file, "w") as fh_out_faa: - write_gene_sequences_from_annotations(input_organism, fh_out_faa, seq_attr_to_write="dna", + write_gene_sequences_from_annotations(input_organism.genes, fh_out_faa, disable_bar=disable_bar) - # get corresponding gene families new_tmpdir = tempfile.TemporaryDirectory(dir=tmpdir, prefix="seq_to_pang_tmpdir_") seq_set, _, seqid_to_gene_family = get_seq2pang(pangenome, str(seq_fasta_file), str(output), new_tmpdir, cpu, no_defrag, identity=identity, From 572e4fa79a79437af21ec95b8d6acb3ebd0f6df9 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 5 Jul 2023 10:12:45 +0200 Subject: [PATCH 036/173] Add logging --- ppanggolin/projection/projection.py | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index 3e759e6c..a4b5431e 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -97,8 +97,7 @@ def compute_RGP(pangenome: Pangenome, input_organism: Organism, persistent_penal rgps = compute_org_rgp(input_organism, multigenics, persistent_penalty, variable_gain, min_length, min_score, naming=name_scheme, disable_bar=disable_bar) - # print("Found RGPS ", len(rgps)) - # TODO add number of RGP found.. 
+ logging.getLogger().info(f"{len(rgps)} RGPs have been predicted the input genomes.") return rgps @@ -227,41 +226,30 @@ def write_projected_modules_to_input_organism(pangenome, input_organism, output, :param compress: Compress the file in .gz """ - # TODO ad logging with number of module associated with the input orga output_file = output / "modules_in_input_organism.tsv" logging.getLogger().info("Writing modules to organisms associations...") input_organism_families = input_organism.families - + counter = 0 with write_compressed_or_not(output_file, compress) as fout: fout.write("module_id\torganism\tcompletion\n") for mod in pangenome.modules: - mod_orgs = set() - module_in_input_organism = any((fam in input_organism_families for fam in mod.families)) if module_in_input_organism: + counter += 1 completion = round(len(input_organism.families & mod.families) / len(mod.families), 2) fout.write(f"module_{mod.ID}\t{input_organism.name}\t{completion}\n") + logging.getLogger().info(f"{counter} modules have been projected to the input genomes.") + logging.getLogger().info( f"Writing projected modules to input organism : '{output_file}'") -# def write_flat_files_for_input_genome(input_organism): - -# # create a pangenome object with only the input organism -# pangenome = Pangenome() - -# pangenome.add_organism(input_organism) - - -# write_flat_files(pangenome, regions=True) - - def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: """ Subparser to launch PPanGGOLiN in Command line From 12ae973d3e6e80deb5922905b87707e112fe682a Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 5 Jul 2023 11:34:32 +0200 Subject: [PATCH 037/173] add spot prediction --- ppanggolin/RGP/spot.py | 1 + ppanggolin/projection/projection.py | 56 +++++++++++++++++++++-------- 2 files changed, 43 insertions(+), 14 deletions(-) diff --git a/ppanggolin/RGP/spot.py b/ppanggolin/RGP/spot.py index afd7cd81..b411a8ab 100644 --- a/ppanggolin/RGP/spot.py +++ 
b/ppanggolin/RGP/spot.py @@ -143,6 +143,7 @@ def add_new_node(g: nx.Graph, region: Region, borders: list): del graph_spot.nodes[node]["rgp"] nx.readwrite.gexf.write_gexf(graph_spot, output + "/spotGraph.gexf") + nx.readwrite.graphml.write_graphml(graph_spot, output + "/spotGraph.graphml") return spots diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index a4b5431e..e48e997d 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -26,6 +26,7 @@ from ppanggolin.formats import check_pangenome_info # from ppanggolin.formats import write_pangenome from ppanggolin.RGP.genomicIsland import naming_scheme, compute_org_rgp +from ppanggolin.RGP.spot import make_spot_graph # from ppanggolin.formats.readBinaries import retrieve_pangenome_parameters from ppanggolin.genome import Organism, Gene, RNA, Contig from ppanggolin.geneFamily import GeneFamily @@ -43,7 +44,7 @@ def annotate_input_genes_with_pangenome_families(pangenome, input_organism, out with open(seq_fasta_file, "w") as fh_out_faa: write_gene_sequences_from_annotations(input_organism.genes, fh_out_faa, - disable_bar=disable_bar) + disable_bar=True) # this progress bar is useless here.. so I disable it. 
# get corresponding gene families new_tmpdir = tempfile.TemporaryDirectory(dir=tmpdir, prefix="seq_to_pang_tmpdir_") seq_set, _, seqid_to_gene_family = get_seq2pang(pangenome, str(seq_fasta_file), str(output), new_tmpdir, cpu, no_defrag, identity=identity, @@ -71,8 +72,8 @@ def annotate_input_genes_with_pangenome_families(pangenome, input_organism, out "genes that do not cluster with any of the gene families of the pangenome.") -def compute_RGP(pangenome: Pangenome, input_organism: Organism, persistent_penalty: int, variable_gain: int, - min_length: int, min_score: int, dup_margin: float, +def predict_RGP(pangenome: Pangenome, input_organism: Organism, persistent_penalty: int, variable_gain: int, + min_length: int, min_score: int, multigenics: float, disable_bar: bool) -> None: """ Compute Regions of Genomic Plasticity (RGP) for the given pangenome and input organism. @@ -83,13 +84,11 @@ def compute_RGP(pangenome: Pangenome, input_organism: Organism, persistent_penal :param variable_gain: Gain score to apply to variable genes. :param min_length: Minimum length (bp) of a region to be considered as RGP. :param min_score: Minimal score required for considering a region as RGP. - :param dup_margin: Minimum ratio of organisms in which a family must have multiple genes to be considered duplicated. + :param multigenics: multigenic families. :param disable_bar: Flag to disable the progress bar. 
:return: None """ - logging.getLogger().info("Detecting multigenic families...") - multigenics = pangenome.get_multigenics(dup_margin) logging.getLogger().info("Computing Regions of Genomic Plasticity...") name_scheme = naming_scheme(pangenome) @@ -205,20 +204,49 @@ def launch(args: argparse.Namespace): if args.predict_rgp: + + logging.getLogger().info("Detecting multigenic families...") + multigenics = pangenome.get_multigenics(args.rgp.dup_margin) + + input_org_rgps = predict_RGP(pangenome, input_organism, persistent_penalty=args.rgp.persistent_penalty, variable_gain=args.rgp.variable_gain, + min_length=args.rgp.min_length, min_score=args.rgp.min_score, multigenics=multigenics, + disable_bar=args.disable_prog_bar) + all_rgps = list(input_org_rgps) + pangenome.regions - rgps = compute_RGP(pangenome, input_organism, persistent_penalty=args.rgp.persistent_penalty, variable_gain=args.rgp.variable_gain, - min_length=args.rgp.min_length, min_score=args.rgp.min_score, dup_margin=args.rgp.dup_margin, - disable_bar=args.disable_prog_bar) + write_predicted_regions(input_org_rgps, output=output_dir) + + spots = predict_spots(all_rgps, multigenics, output=output_dir, spot_graph=args.spot.spot_graph, + overlapping_match=args.spot.overlapping_match, set_size=args.spot.set_size, + exact_match=args.spot.exact_match_size) - write_predicted_regions(rgps, output=output_dir) if args.project_modules: write_projected_modules_to_input_organism(pangenome, input_organism, output_dir) - if args.project_spots: - pass - + # write_flat_files_for_input_genome(input_organism) + +def predict_spots(rgps: list, multigenics: set, output: str, + spot_graph: bool = False, overlapping_match: int = 2, set_size: int = 3, exact_match: int = 1): + + """ + Create a spot graph from pangenome RGP + + :param rgps: list of pangenome RGP + :param multigenics: pangenome graph multigenic persistent families + :param output: Output directory to save the spot graph + :param spot_graph: Writes gexf graph of pairs 
of blocks of single copy markers flanking RGPs from same hotspot + :param overlapping_match: Number of missing persistent genes allowed when comparing flanking genes + :param set_size: Number of single copy markers to use as flanking genes for RGP during hotspot computation + :param exact_match: Number of perfectly matching flanking single copy markers required to associate RGPs + + :return: list of computed spots + """ + + spots = make_spot_graph(rgps=rgps, multigenics=multigenics, output=output, spot_graph=spot_graph, + overlapping_match=overlapping_match, set_size=set_size, exact_match=exact_match) + + def write_projected_modules_to_input_organism(pangenome, input_organism, output, compress=False): """Write a tsv file providing association between modules and organisms @@ -293,7 +321,7 @@ def parser_projection(parser: argparse.ArgumentParser): help="Output directory") optional.add_argument('--predict_rgp', required=False, action='store_true', default=False, - help="Predict rgp on the input genome.") + help="Predict RGPs and hot spots on the input genome.") optional.add_argument('--project_modules', required=False, action='store_true', default=False, help="Project pangenome modules to the input genome.") optional.add_argument('--project_spots', required=False, action='store_true', default=False, From e6f3effb7e446cc6597e02f7aeb9708471eea593 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 5 Jul 2023 12:27:50 +0200 Subject: [PATCH 038/173] add Path type in subparser --- ppanggolin/projection/projection.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index e48e997d..b4f3f597 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -47,7 +47,7 @@ def annotate_input_genes_with_pangenome_families(pangenome, input_organism, out disable_bar=True) # this progress bar is useless here.. so I disable it. 
# get corresponding gene families new_tmpdir = tempfile.TemporaryDirectory(dir=tmpdir, prefix="seq_to_pang_tmpdir_") - seq_set, _, seqid_to_gene_family = get_seq2pang(pangenome, str(seq_fasta_file), str(output), new_tmpdir, cpu, no_defrag, identity=identity, + seq_set, _, seqid_to_gene_family = get_seq2pang(pangenome, seq_fasta_file, output, new_tmpdir, cpu, no_defrag, identity=identity, coverage=coverage, is_protein=False, translation_table=translation_table) # this function only write the seqid and partition associated in a file @@ -299,16 +299,16 @@ def parser_projection(parser: argparse.ArgumentParser): """ required = parser.add_argument_group(title="Required arguments", description="One of the following arguments is required :") - required.add_argument('-p', '--pangenome', required=False, type=str, help="The pangenome.h5 file") + required.add_argument('-p', '--pangenome', required=False, type=Path, help="The pangenome.h5 file") required.add_argument('--organism_name', required=False, type=str, help="Name of the input_organism whose genome is being annotated with the provided pangenome.") - required.add_argument('--fasta_file', required=False, type=str, + required.add_argument('--fasta_file', required=False, type=Path, help="The filepath of the genomic sequence(s) in FASTA format for the projected genome. " "(Fasta file can be compressed with gzip)") - required.add_argument('--annot_file', required=False, type=str, + required.add_argument('--annot_file', required=False, type=Path, help="The filepath of the annotations in GFF/GBFF format for the projected genome. 
" "(Annotation file can be compressed with gzip)") From 309f5a5ca40d64b42841d93e536ed7126d373a5c Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Thu, 6 Jul 2023 12:13:59 +0200 Subject: [PATCH 039/173] correct typo in spot graph writting --- ppanggolin/RGP/spot.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ppanggolin/RGP/spot.py b/ppanggolin/RGP/spot.py index 9bc5d5e9..32f31b88 100644 --- a/ppanggolin/RGP/spot.py +++ b/ppanggolin/RGP/spot.py @@ -142,9 +142,9 @@ def add_new_node(g: nx.Graph, region: Region, borders: list): del graph_spot.nodes[node]["border0"] del graph_spot.nodes[node]["border1"] del graph_spot.nodes[node]["rgp"] - - nx.readwrite.gexf.write_gexf(graph_spot, output / "/spotGraph.gexf") - nx.readwrite.graphml.write_graphml(graph_spot, output / "/spotGraph.graphml") + + nx.readwrite.gexf.write_gexf(graph_spot, output / "spotGraph.gexf") + nx.readwrite.graphml.write_graphml(graph_spot, output / "spotGraph.graphml") return spots From b9cc81fddd0a6989b3d682e8b5c9c95bb4875410 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Thu, 6 Jul 2023 12:14:43 +0200 Subject: [PATCH 040/173] improve config argument checking --- ppanggolin/utils.py | 58 +++++++++++++++++++++++++-------------------- 1 file changed, 32 insertions(+), 26 deletions(-) diff --git a/ppanggolin/utils.py b/ppanggolin/utils.py index 52f6f4d5..2aa18b62 100755 --- a/ppanggolin/utils.py +++ b/ppanggolin/utils.py @@ -618,12 +618,16 @@ def manage_cli_and_config_args(subcommand: str, config_file: str, subcommand_to_ f"{len(params_that_differ)} {subcommand} parameters have non-default value: {params_that_differ_str}") # manage workflow command + workflow_steps = [] if subcommand in WORKFLOW_SUBCOMMANDS: - for workflow_step in ALL_WORKFLOW_DEPENDENCIES: + + workflow_steps = [wf_step for wf_step in ALL_WORKFLOW_DEPENDENCIES if not (wf_step in ["rgp", "spot"] and subcommand in ["workflow", "panmodule"]) or \ + not (wf_step == "module" and subcommand in ["workflow", 
"panmodule"])] + + for workflow_step in workflow_steps: if (workflow_step in ["rgp", "spot"] and subcommand in ["workflow", "panmodule"]) or \ (workflow_step == "module" and subcommand in ["workflow", "panmodule"]): continue - logging.getLogger("PPanGGOLiN").debug(f'Parsing {workflow_step} arguments in config file.') step_subparser = subcommand_to_subparser[workflow_step] @@ -665,43 +669,45 @@ def manage_cli_and_config_args(subcommand: str, config_file: str, subcommand_to_ # manage projection step parameters elif subcommand == "projection" : - for projection_step in ["annotate", "cluster", "graph", "partition", "rarefaction", "rgp", "spot", "module"]: + + projection_step = "annotate" + workflow_steps = [projection_step] + logging.getLogger().debug(f'Parsing {projection_step} arguments in config file.') + step_subparser = subcommand_to_subparser[projection_step] - logging.getLogger().debug(f'Parsing {projection_step} arguments in config file.') - step_subparser = subcommand_to_subparser[projection_step] + default_step_args = get_default_args(projection_step, step_subparser, unwanted_args=all_unspecific_params) - default_step_args = get_default_args(projection_step, step_subparser, unwanted_args=all_unspecific_params) + # remove general args + all_param_names = {arg_name for arg_name in dir(default_step_args) if not arg_name.startswith('_')} + specific_step_params = {param_name for param_name in all_param_names if + param_name not in all_unspecific_params} + + config_step_args = get_config_args(projection_step, step_subparser, config, projection_step, + specific_step_params, strict_config_check=True) - # remove general args - all_param_names = {arg_name for arg_name in dir(default_step_args) if not arg_name.startswith('_')} - specific_step_params = {param_name for param_name in all_param_names if - param_name not in all_unspecific_params} - config_step_args = get_config_args(projection_step, step_subparser, config, projection_step, - specific_step_params, 
strict_config_check=True) + step_args = overwrite_args(default_step_args, config_step_args, cli_args) - step_args = overwrite_args(default_step_args, config_step_args, cli_args) + step_params_that_differ = get_args_that_differe_from_default(default_step_args, step_args) - step_params_that_differ = get_args_that_differe_from_default(default_step_args, step_args) - - if step_params_that_differ: - step_params_that_differ_str = ', '.join([f'{p}={v}' for p, v in step_params_that_differ.items()]) - logging.getLogger().debug( - f"{len(step_params_that_differ)} {projection_step} parameters have a non-default value: {step_params_that_differ_str}") + if step_params_that_differ: + step_params_that_differ_str = ', '.join([f'{p}={v}' for p, v in step_params_that_differ.items()]) + logging.getLogger().debug( + f"{len(step_params_that_differ)} {projection_step} parameters have a non-default value: {step_params_that_differ_str}") - # add step name to differentiate the params - step_params_that_differ = {f'{projection_step}:{param}': value for param, value in - step_params_that_differ.items()} + # add step name to differentiate the params + step_params_that_differ = {f'{projection_step}:{param}': value for param, value in + step_params_that_differ.items()} - params_that_differ.update(step_params_that_differ) + params_that_differ.update(step_params_that_differ) - # Add args namespace of the step to the inital args namespace - setattr(args, projection_step, step_args) + # Add args namespace of the step to the inital args namespace + setattr(args, projection_step, step_args) if params_that_differ: logging.getLogger("PPanGGOLiN").info(f'{len(params_that_differ)} parameters have a non-default value.') - check_config_consistency(config, ALL_WORKFLOW_DEPENDENCIES) + check_config_consistency(config, workflow_steps) return args From 94c0eda1e4bcdaa0f809d2537092f1f217e66ed1 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Thu, 6 Jul 2023 13:48:11 +0200 Subject: [PATCH 041/173] fix Path of 
tmpdir in context --- ppanggolin/context/searchGeneContext.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index 224e5ed5..cb869de9 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -54,7 +54,7 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: Path, tmpdir: if sequences is not None: # Alignment of sequences on pangenome families new_tmpdir = tempfile.TemporaryDirectory(dir=tmpdir) - seq_set, _, seq2pan = get_seq2pang(pangenome, sequences, output, new_tmpdir, cpu, no_defrag, identity, + seq_set, _, seq2pan = get_seq2pang(pangenome, sequences, output, Path(new_tmpdir.name), cpu, no_defrag, identity, coverage) project_and_write_partition(seq2pan, seq_set, output) new_tmpdir.cleanup() From 2e3e63f8711fa0cc20b7b9b716e559dafe32dd89 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Thu, 6 Jul 2023 18:18:07 +0200 Subject: [PATCH 042/173] homogenize argparse param name and stored param in pangenome for annotate, rgp, module and spot --- ppanggolin/RGP/genomicIsland.py | 12 +++--- ppanggolin/RGP/spot.py | 12 +++--- ppanggolin/align/alignOnPang.py | 10 ++--- ppanggolin/annotate/annotate.py | 21 ++++++----- ppanggolin/figures/draw_spot.py | 8 ++-- ppanggolin/formats/readBinaries.py | 26 +++++++++---- ppanggolin/formats/writeFlat.py | 2 +- ppanggolin/info/info.py | 59 +++++++++++++++--------------- ppanggolin/mod/module.py | 12 +++--- 9 files changed, 87 insertions(+), 75 deletions(-) diff --git a/ppanggolin/RGP/genomicIsland.py b/ppanggolin/RGP/genomicIsland.py index 6e5233ba..30ff9480 100644 --- a/ppanggolin/RGP/genomicIsland.py +++ b/ppanggolin/RGP/genomicIsland.py @@ -287,12 +287,12 @@ def predict_rgp(pangenome: Pangenome, persistent_penalty: int = 3, variable_gain logging.getLogger("PPanGGOLiN").info(f"Predicted {len(pangenome.regions)} RGP") # save parameters and save status - 
pangenome.parameters["RGP"] = {} - pangenome.parameters["RGP"]["persistent_penalty"] = persistent_penalty - pangenome.parameters["RGP"]["variable_gain"] = variable_gain - pangenome.parameters["RGP"]["min_length"] = min_length - pangenome.parameters["RGP"]["min_score"] = min_score - pangenome.parameters["RGP"]["dup_margin"] = dup_margin + pangenome.parameters["rgp"] = {} + pangenome.parameters["rgp"]["persistent_penalty"] = persistent_penalty + pangenome.parameters["rgp"]["variable_gain"] = variable_gain + pangenome.parameters["rgp"]["min_length"] = min_length + pangenome.parameters["rgp"]["min_score"] = min_score + pangenome.parameters["rgp"]["dup_margin"] = dup_margin pangenome.status['predictedRGP'] = "Computed" diff --git a/ppanggolin/RGP/spot.py b/ppanggolin/RGP/spot.py index 32f31b88..f54722da 100644 --- a/ppanggolin/RGP/spot.py +++ b/ppanggolin/RGP/spot.py @@ -142,7 +142,7 @@ def add_new_node(g: nx.Graph, region: Region, borders: list): del graph_spot.nodes[node]["border0"] del graph_spot.nodes[node]["border1"] del graph_spot.nodes[node]["rgp"] - + nx.readwrite.gexf.write_gexf(graph_spot, output / "spotGraph.gexf") nx.readwrite.graphml.write_graphml(graph_spot, output / "spotGraph.graphml") return spots @@ -191,7 +191,7 @@ def predict_hotspots(pangenome: Pangenome, output: Path, spot_graph: bool = Fals # get multigenic gene families logging.getLogger("PPanGGOLiN").info("Detecting multigenic families...") - multigenics = pangenome.get_multigenics(pangenome.parameters["RGP"]["dup_margin"]) + multigenics = pangenome.get_multigenics(pangenome.parameters["rgp"]["dup_margin"]) logging.getLogger("PPanGGOLiN").info("Detecting hotspots in the pangenome...") @@ -206,10 +206,10 @@ def predict_hotspots(pangenome: Pangenome, output: Path, spot_graph: bool = Fals pangenome.add_spots(spots) pangenome.status["spots"] = "Computed" - pangenome.parameters["spots"] = {} - pangenome.parameters["spots"]["set_size"] = set_size - pangenome.parameters["spots"]["overlapping_match"] = 
overlapping_match - pangenome.parameters["spots"]["exact_match"] = exact_match + pangenome.parameters["spot"] = {} + pangenome.parameters["spot"]["set_size"] = set_size + pangenome.parameters["spot"]["overlapping_match"] = overlapping_match + pangenome.parameters["spot"]["exact_match_size"] = exact_match def launch(args: argparse.Namespace): diff --git a/ppanggolin/align/alignOnPang.py b/ppanggolin/align/alignOnPang.py index 12d00f14..19a9f9e8 100644 --- a/ppanggolin/align/alignOnPang.py +++ b/ppanggolin/align/alignOnPang.py @@ -181,7 +181,7 @@ def get_fam_to_rgp(pangenome, multigenics: set) -> dict: for rgp in pangenome.regions: for fam in rgp.families: fam2rgp[fam].append(rgp.name) - for fam in [gene.family for border in rgp.get_bordering_genes(pangenome.parameters["spots"]["set_size"], + for fam in [gene.family for border in rgp.get_bordering_genes(pangenome.parameters["spot"]["set_size"], multigenics) for gene in border]: fam2rgp[fam].append(rgp.name) return fam2rgp @@ -206,7 +206,7 @@ def get_fam_to_spot(pangenome: Pangenome, multigenics: Set[GeneFamily]) \ for rgp in spot.regions: fams |= rgp.families fams_border |= set([gene.family for border in # Set of families in border of spot - rgp.get_bordering_genes(pangenome.parameters["spots"]["set_size"], multigenics) + rgp.get_bordering_genes(pangenome.parameters["spot"]["set_size"], multigenics) for gene in border]) for fam in fams: fam2spot[fam].append(spot) @@ -254,7 +254,7 @@ def get_seq_info(seq_to_pang: dict, pangenome: Pangenome, output: Path, draw_rel :return: """ logging.getLogger("PPanGGOLiN").info("Writing RGP and spot information related to hits in the pangenome") - multigenics = pangenome.get_multigenics(pangenome.parameters["RGP"]["dup_margin"]) + multigenics = pangenome.get_multigenics(pangenome.parameters["rgp"]["dup_margin"]) finfo = open(output / "info_input_seq.tsv", "w") finfo.write("input\tfamily\tpartition\tspot_list_as_member\tspot_list_as_border\trgp_list\n") @@ -275,8 +275,8 @@ def 
get_seq_info(seq_to_pang: dict, pangenome: Pangenome, output: Path, draw_rel drawn_spots.add(spot) logging.getLogger("PPanGGOLiN").info(f"Drawing the {len(drawn_spots)} spots with more than 1 organization " f"related to hits of the input sequences...") - draw_selected_spots(drawn_spots, pangenome, output, pangenome.parameters["spots"]["overlapping_match"], - pangenome.parameters["spots"]["exact_match"], pangenome.parameters["spots"]["set_size"], + draw_selected_spots(drawn_spots, pangenome, output, pangenome.parameters["spot"]["overlapping_match"], + pangenome.parameters["spot"]["exact_match_size"], pangenome.parameters["spot"]["set_size"], disable_bar=disable_bar) fam2mod = {} # fam2module diff --git a/ppanggolin/annotate/annotate.py b/ppanggolin/annotate/annotate.py index 337836f4..3db04d81 100644 --- a/ppanggolin/annotate/annotate.py +++ b/ppanggolin/annotate/annotate.py @@ -468,10 +468,10 @@ def read_annotations(pangenome: Pangenome, organisms_file: Path, cpu: int = 1, p "PPanGGOLiN will use self-generated identifiers.") pangenome.status["genomesAnnotated"] = "Computed" - pangenome.parameters["annotation"] = {} - pangenome.parameters["annotation"]["used_local_identifiers"] = used_local_identifiers - pangenome.parameters["annotation"]["read_pseudogenes"] = pseudo - pangenome.parameters["annotation"]["read_annotations_from_file"] = True + pangenome.parameters["annotate"] = {} + pangenome.parameters["annotate"]["used_local_identifiers"] = used_local_identifiers + pangenome.parameters["annotate"]["use_pseudo"] = pseudo + pangenome.parameters["annotate"]["read_annotations_from_file"] = True def get_gene_sequences_from_fastas(pangenome, fasta_file): @@ -569,12 +569,13 @@ def annotate_pangenome(pangenome: Pangenome, fasta_list: Path, tmpdir: str, cpu: logging.getLogger("PPanGGOLiN").info("Done annotating genomes") pangenome.status["genomesAnnotated"] = "Computed" # the pangenome is now annotated. 
pangenome.status["geneSequences"] = "Computed" # the gene objects have their respective gene sequences. - pangenome.parameters["annotation"] = {} - pangenome.parameters["annotation"]["remove_Overlapping_CDS"] = overlap - pangenome.parameters["annotation"]["annotate_RNA"] = True if not norna else False - pangenome.parameters["annotation"]["kingdom"] = kingdom - pangenome.parameters["annotation"]["translation_table"] = translation_table - pangenome.parameters["annotation"]["read_annotations_from_file"] = False + pangenome.parameters["annotate"] = {} + pangenome.parameters["annotate"]["allow_overlap"] = overlap + pangenome.parameters["annotate"]["norna"] = norna + pangenome.parameters["annotate"]["kingdom"] = kingdom + pangenome.parameters["annotate"]["translation_table"] = translation_table + pangenome.parameters["annotate"]["prodigal_procedure"] = procedure + pangenome.parameters["annotate"]["read_annotations_from_file"] = False def launch(args: argparse.Namespace): diff --git a/ppanggolin/figures/draw_spot.py b/ppanggolin/figures/draw_spot.py index e0403bcb..6029501c 100644 --- a/ppanggolin/figures/draw_spot.py +++ b/ppanggolin/figures/draw_spot.py @@ -566,7 +566,7 @@ def draw_selected_spots(selected_spots: Union[List[Spot], Set[Spot]], pangenome: logging.getLogger("PPanGGOLiN").info("Ordering genes among regions, and drawing spots...") - multigenics = pangenome.get_multigenics(pangenome.parameters["RGP"]["dup_margin"]) + multigenics = pangenome.get_multigenics(pangenome.parameters["rgp"]["dup_margin"]) fam2mod = {} for mod in pangenome.modules: @@ -666,6 +666,6 @@ def draw_spots(pangenome: Pangenome, output: Path, spot_list: str, disable_bar: logging.getLogger("PPanGGOLiN").info(f"Drawing {len(selected_spots)} spots") draw_selected_spots(selected_spots, pangenome, output, - overlapping_match=pangenome.parameters["spots"]["overlapping_match"], - exact_match=pangenome.parameters["spots"]["exact_match"], - set_size=pangenome.parameters["spots"]["set_size"], 
disable_bar=disable_bar) + overlapping_match=pangenome.parameters["spot"]["overlapping_match"], + exact_match=pangenome.parameters["spot"]["exact_match_size"], + set_size=pangenome.parameters["spot"]["set_size"], disable_bar=disable_bar) diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py index 93ee2d83..0a5407e6 100644 --- a/ppanggolin/formats/readBinaries.py +++ b/ppanggolin/formats/readBinaries.py @@ -5,9 +5,9 @@ import logging import sys from pathlib import Path -# installed libraries -from typing import TextIO +from typing import TextIO, Dict, Any +# installed libraries from tables import Table from tqdm import tqdm import tables @@ -528,19 +528,31 @@ def read_modules_info(h5f: tables.File): f"mean: {info_group._v_attrs['StatOfFamiliesInModules']['mean']}") -def read_parameters(h5f: tables.File): +def print_pangenome_parameters(h5f: tables.File): """ Read pangenome parameters :param h5f: Pangenome HDF5 file """ + step_to_parameters = get_pangenome_parameters(h5f) + + for step, param_name_to_value in step_to_parameters.items(): + print(f"{step}:") + for param_name, val in param_name_to_value.items(): + print(f" {param_name} : {val}") + +def get_pangenome_parameters(h5f: tables.File) -> Dict[str, Dict[str, Any]]: + """ + Read and return the pangenome parameters. + + :param h5f: Pangenome HDF5 file + :return: A dictionary containing the name of the ppanggolin step as the key, and a dictionary of parameter names + and their corresponding values used for that step. 
+ """ if "/info" in h5f: info_group = h5f.root.info if "parameters" in info_group._v_attrs._f_list(): - for key, dic in info_group._v_attrs["parameters"].items(): - print(f"{key}") - for key2, val in dic.items(): - print(f" {key2} : {val}") + return info_group._v_attrs["parameters"] def read_pangenome(pangenome, annotation: bool = False, gene_families: bool = False, graph: bool = False, diff --git a/ppanggolin/formats/writeFlat.py b/ppanggolin/formats/writeFlat.py index 7bca515e..ab32ab90 100644 --- a/ppanggolin/formats/writeFlat.py +++ b/ppanggolin/formats/writeFlat.py @@ -729,7 +729,7 @@ def write_borders(output: Path, dup_margin: float = 0.05, compress: bool = False with write_compressed_or_not(output / "spot_borders.tsv", compress) as fout: fout.write("spot_id\tnumber\tborder1\tborder2\n") for spot in sorted(pan.spots, key=lambda x: len(x.regions), reverse=True): - curr_borders = spot.borders(pan.parameters["spots"]["set_size"], multigenics) + curr_borders = spot.borders(pan.parameters["spot"]["set_size"], multigenics) for c, border in curr_borders: famstring1 = ",".join([fam.name for fam in border[0]]) famstring2 = ",".join([fam.name for fam in border[1]]) diff --git a/ppanggolin/info/info.py b/ppanggolin/info/info.py index 97e81f85..6e894d2f 100644 --- a/ppanggolin/info/info.py +++ b/ppanggolin/info/info.py @@ -9,7 +9,7 @@ import tables # local libraries -from ppanggolin.formats import read_info, read_parameters, fix_partitioned +from ppanggolin.formats import read_info, print_pangenome_parameters, fix_partitioned def print_info(pangenome: Path, status: bool = False, content: bool = False, parameters: bool = False): @@ -23,35 +23,34 @@ def print_info(pangenome: Path, status: bool = False, content: bool = False, par """ fix_partitioned(pangenome) if status or content or parameters: - h5f = tables.open_file(pangenome, "r+") - if status: - status_group = h5f.root.status - print(f"genomes annotated : {'true' if status_group._v_attrs.genomesAnnotated else 
'false'}") - print(f"genes clustered : {'true' if status_group._v_attrs.genesClustered else 'false'}") - print(f"genes have their sequences : {'true' if status_group._v_attrs.geneSequences else 'false'}") - print(f"gene families have their sequences : " - f"{'true' if status_group._v_attrs.geneFamilySequences else 'false'}") - print(f"neighbors graph : {'true' if status_group._v_attrs.NeighborsGraph else 'false'}") - if status_group._v_attrs.Partitioned: - print("pangenome partitioned : true") - else: - print("pangenome partitioned : false") - if hasattr(status_group._v_attrs, "predictedRGP"): - print(f"RGP predicted : {'true' if status_group._v_attrs.predictedRGP else 'false'}") - - if hasattr(status_group._v_attrs, "spots"): - print(f"Spots predicted : {'true' if status_group._v_attrs.spots else 'false'}") - - if hasattr(status_group._v_attrs, "modules"): - print(f"Modules predicted : {'true' if status_group._v_attrs.modules else 'false'}") - - if hasattr(status_group._v_attrs, "version"): - print(f"PPanGGOLiN version : {status_group._v_attrs.version}") - if content: - read_info(h5f) - if parameters: - read_parameters(h5f) - h5f.close() + with tables.open_file(pangenome, "r+") as h5f: + if status: + status_group = h5f.root.status + print(f"genomes annotated : {'true' if status_group._v_attrs.genomesAnnotated else 'false'}") + print(f"genes clustered : {'true' if status_group._v_attrs.genesClustered else 'false'}") + print(f"genes have their sequences : {'true' if status_group._v_attrs.geneSequences else 'false'}") + print(f"gene families have their sequences : " + f"{'true' if status_group._v_attrs.geneFamilySequences else 'false'}") + print(f"neighbors graph : {'true' if status_group._v_attrs.NeighborsGraph else 'false'}") + if status_group._v_attrs.Partitioned: + print("pangenome partitioned : true") + else: + print("pangenome partitioned : false") + if hasattr(status_group._v_attrs, "predictedRGP"): + print(f"RGP predicted : {'true' if 
status_group._v_attrs.predictedRGP else 'false'}") + + if hasattr(status_group._v_attrs, "spots"): + print(f"Spots predicted : {'true' if status_group._v_attrs.spots else 'false'}") + + if hasattr(status_group._v_attrs, "modules"): + print(f"Modules predicted : {'true' if status_group._v_attrs.modules else 'false'}") + + if hasattr(status_group._v_attrs, "version"): + print(f"PPanGGOLiN version : {status_group._v_attrs.version}") + if content: + read_info(h5f) + if parameters: + print_pangenome_parameters(h5f) else: print("Please select what information you want by using --parameters, --content or --status") diff --git a/ppanggolin/mod/module.py b/ppanggolin/mod/module.py index b8b09c07..2058fbb2 100644 --- a/ppanggolin/mod/module.py +++ b/ppanggolin/mod/module.py @@ -130,12 +130,12 @@ def predict_modules(pangenome: Pangenome, dup_margin: float = 0.05, size: int = pangenome.add_modules(modules) pangenome.status["modules"] = "Computed" - pangenome.parameters["modules"] = {} - pangenome.parameters["modules"]["size"] = size - pangenome.parameters["modules"]["min_presence"] = min_presence - pangenome.parameters["modules"]["transitive"] = transitive - pangenome.parameters["modules"]["jaccard"] = jaccard - pangenome.parameters["modules"]["dup_margin"] = dup_margin + pangenome.parameters["module"] = {} + pangenome.parameters["module"]["size"] = size + pangenome.parameters["module"]["min_presence"] = min_presence + pangenome.parameters["module"]["transitive"] = transitive + pangenome.parameters["module"]["jaccard"] = jaccard + pangenome.parameters["module"]["dup_margin"] = dup_margin def launch(args: argparse.Namespace): From 882c09ef6ec22249ace78505d443f1f8081797af Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Fri, 7 Jul 2023 10:24:18 +0200 Subject: [PATCH 043/173] standardized all parameters name stored in pan to match argparse name --- ppanggolin/annotate/annotate.py | 6 +- ppanggolin/cluster/cluster.py | 14 +-- ppanggolin/graph/makeGraph.py | 6 +- 
ppanggolin/nem/partition.py | 13 ++- ppanggolin/nem/rarefaction.py | 6 +- ppanggolin/projection/projection.py | 145 ++++++++++++++++------------ 6 files changed, 111 insertions(+), 79 deletions(-) diff --git a/ppanggolin/annotate/annotate.py b/ppanggolin/annotate/annotate.py index 3db04d81..b2317f1b 100644 --- a/ppanggolin/annotate/annotate.py +++ b/ppanggolin/annotate/annotate.py @@ -469,7 +469,7 @@ def read_annotations(pangenome: Pangenome, organisms_file: Path, cpu: int = 1, p pangenome.status["genomesAnnotated"] = "Computed" pangenome.parameters["annotate"] = {} - pangenome.parameters["annotate"]["used_local_identifiers"] = used_local_identifiers + pangenome.parameters["annotate"]["# used_local_identifiers"] = used_local_identifiers pangenome.parameters["annotate"]["use_pseudo"] = pseudo pangenome.parameters["annotate"]["read_annotations_from_file"] = True @@ -574,8 +574,8 @@ def annotate_pangenome(pangenome: Pangenome, fasta_list: Path, tmpdir: str, cpu: pangenome.parameters["annotate"]["norna"] = norna pangenome.parameters["annotate"]["kingdom"] = kingdom pangenome.parameters["annotate"]["translation_table"] = translation_table - pangenome.parameters["annotate"]["prodigal_procedure"] = procedure - pangenome.parameters["annotate"]["read_annotations_from_file"] = False + pangenome.parameters["annotate"]["prodigal_procedure"] = False if procedure is None else procedure + pangenome.parameters["annotate"]["# read_annotations_from_file"] = False def launch(args: argparse.Namespace): diff --git a/ppanggolin/cluster/cluster.py b/ppanggolin/cluster/cluster.py index 86fc7d62..1514d4d4 100644 --- a/ppanggolin/cluster/cluster.py +++ b/ppanggolin/cluster/cluster.py @@ -316,9 +316,11 @@ def clustering(pangenome: Pangenome, tmpdir: Path, cpu: int = 1, defrag: bool = pangenome.parameters["cluster"] = {} pangenome.parameters["cluster"]["coverage"] = coverage pangenome.parameters["cluster"]["identity"] = identity - pangenome.parameters["cluster"]["defragmentation"] = defrag + 
pangenome.parameters["cluster"]["# defragmentation"] = defrag + pangenome.parameters["cluster"]["no_defrag"] = not defrag + pangenome.parameters["cluster"]["translation_table"] = code - pangenome.parameters["cluster"]["read_clustering_from_file"] = False + pangenome.parameters["cluster"]["# read_clustering_from_file"] = False # Read clustering @@ -334,12 +336,12 @@ def mk_local_to_gene(pangenome: Pangenome) -> dict: old_len = len(local_dict) local_dict[gene.local_identifier] = gene if len(local_dict) == old_len: - if pangenome.parameters["annotation"]["read_annotations_from_file"] and not \ - pangenome.parameters["annotation"]["used_local_identifiers"]: + if pangenome.parameters["annotate"]["# read_annotations_from_file"] and not \ + pangenome.parameters["annotate"]["# used_local_identifiers"]: raise Exception(f"'{gene.local_identifier}' was found multiple times used as an identifier. " f"The identifier of the genes (locus_tag, protein_id in gbff, ID in gff) were not " f"unique throughout all of the files. It is thus impossible to differentiate the genes." - f" To use this function while importing annotation, all identifiers MUST be unique " + f" To use this function while importing annotate, all identifiers MUST be unique " f"throughout all of your genomes") return {} # local identifiers are not unique. return local_dict @@ -421,7 +423,7 @@ def read_clustering(pangenome: Pangenome, families_tsv_file: Path, infer_singlet if frag: # if there was fragment information in the file. 
pangenome.status["defragmented"] = "Computed" pangenome.parameters["cluster"] = {} - pangenome.parameters["cluster"]["read_clustering_from_file"] = True + pangenome.parameters["cluster"]["# read_clustering_from_file"] = True pangenome.parameters["cluster"]["infer_singletons"] = infer_singleton diff --git a/ppanggolin/graph/makeGraph.py b/ppanggolin/graph/makeGraph.py index 10e8bdfc..66187440 100644 --- a/ppanggolin/graph/makeGraph.py +++ b/ppanggolin/graph/makeGraph.py @@ -110,10 +110,10 @@ def compute_neighbors_graph(pangenome: Pangenome, remove_copy_number: int = 0, pangenome.status["neighborsGraph"] = "Computed" pangenome.parameters["graph"] = {} - pangenome.parameters["graph"]["removed_high_copy_number_families"] = False + # pangenome.parameters["graph"]["removed_high_copy_number_families"] = False if remove_copy_number > 0: - pangenome.parameters["graph"]["removed_high_copy_number_families"] = True - pangenome.parameters["graph"]["removed_high_copy_number_of_families_above"] = remove_copy_number + # pangenome.parameters["graph"]["removed_high_copy_number_families"] = True + pangenome.parameters["graph"]["remove_high_copy_number"] = remove_copy_number def launch(args: argparse.Namespace): diff --git a/ppanggolin/nem/partition.py b/ppanggolin/nem/partition.py index eccde1b1..8094ebaf 100644 --- a/ppanggolin/nem/partition.py +++ b/ppanggolin/nem/partition.py @@ -478,20 +478,25 @@ def partition(pangenome: Pangenome, output: Path = None, beta: float = 2.5, sm_d pangenome.parameters["partition"] = {} pangenome.parameters["partition"]["beta"] = beta + pangenome.parameters["partition"]["max_degree_smoothing"] = sm_degree pangenome.parameters["partition"]["free_dispersion"] = free_dispersion - pangenome.parameters["partition"]["max_node_degree_for_smoothing"] = sm_degree + pangenome.parameters["partition"]["ICL_margin"] = icl_margin + pangenome.parameters["partition"]["seed"] = seed if len(organisms) > chunk_size: pangenome.parameters["partition"]["chunk_size"] = 
chunk_size - pangenome.parameters["partition"]["computed_K"] = False + pangenome.parameters["partition"]["# computed nb of partitions"] = False + # the K value initally given by the user + pangenome.parameters["partition"]["nb_of_partitions"] = kval if kval < 2: - pangenome.parameters["partition"]["computed_K"] = True + pangenome.parameters["partition"]["# computed nb of partitions"] = True logging.getLogger("PPanGGOLiN").info("Estimating the optimal number of partitions...") kval = evaluate_nb_partitions(organisms, output, sm_degree, free_dispersion, chunk_size, kmm, icl_margin, draw_icl, cpu, seed, tmp_path, disable_bar) logging.getLogger("PPanGGOLiN").info(f"The number of partitions has been evaluated at {kval}") - pangenome.parameters["partition"]["K"] = kval + pangenome.parameters["partition"]["# final nb of partitions"] = kval + pangenome.parameters["partition"]["krange"] = kmm init = "param_file" partitioning_results = {} diff --git a/ppanggolin/nem/rarefaction.py b/ppanggolin/nem/rarefaction.py index d13ea370..c2a5363a 100644 --- a/ppanggolin/nem/rarefaction.py +++ b/ppanggolin/nem/rarefaction.py @@ -362,8 +362,8 @@ def make_rarefaction_curve(pangenome: Pangenome, output: Path, tmpdir: Path = No ppp.pan = pangenome # use the global from partition to store the pangenome, so that it is usable try: - krange[0] = ppp.pan.parameters["partition"]["K"] if krange[0] < 0 else krange[0] - krange[1] = ppp.pan.parameters["partition"]["K"] if krange[1] < 0 else krange[1] + krange[0] = ppp.pan.parameters["partition"]["# final nb of partitions"] if krange[0] < 0 else krange[0] + krange[1] = ppp.pan.parameters["partition"]["# final nb of partitions"] if krange[1] < 0 else krange[1] except KeyError: krange = [3, 20] check_pangenome_info(pangenome, need_annotations=True, need_families=True, need_graph=True, disable_bar=disable_bar) @@ -378,7 +378,7 @@ def make_rarefaction_curve(pangenome: Pangenome, output: Path, tmpdir: Path = No if kval < 3 and kestimate is False: # 
estimate K once and for all. try: - kval = ppp.pan.parameters["partition"]["K"] + kval = ppp.pan.parameters["partition"]["# final nb of partitions"] logging.getLogger("PPanGGOLiN").info(f"Reuse the number of partitions {kval}") except KeyError: logging.getLogger("PPanGGOLiN").info("Estimating the number of partitions...") diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index cd70b4a8..ebbdbaf1 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -10,7 +10,6 @@ from pathlib import Path import tempfile from typing import Tuple, Set, Dict, Iterator - # installed libraries from tqdm import tqdm @@ -23,11 +22,10 @@ from ppanggolin.utils import read_compressed_or_not, write_compressed_or_not, mk_file_name, min_one, restricted_float, mk_outdir from ppanggolin.align.alignOnPang import get_seq2pang, project_and_write_partition from ppanggolin.formats.writeSequences import write_gene_sequences_from_annotations -from ppanggolin.formats import check_pangenome_info +from ppanggolin.formats.readBinaries import get_pangenome_parameters, check_pangenome_info # from ppanggolin.formats import write_pangenome from ppanggolin.RGP.genomicIsland import naming_scheme, compute_org_rgp from ppanggolin.RGP.spot import make_spot_graph -# from ppanggolin.formats.readBinaries import retrieve_pangenome_parameters from ppanggolin.genome import Organism, Gene, RNA, Contig from ppanggolin.geneFamily import GeneFamily from ppanggolin.region import Region @@ -151,7 +149,22 @@ def launch(args: argparse.Namespace): :param args: All arguments provide by user """ - + + # For the moment this element of the pangenome are predicted by default + project_modules = True + predict_rgp = True + project_spots = True + # TODO : check that the different elements have been predicted in the pangenome. if not need to define a behavior... 
+ # load pangenome + + pangenome = Pangenome() + pangenome.add_file(args.pangenome) + + check_pangenome_info(pangenome, need_annotations=True, need_families=True, disable_bar=args.disable_prog_bar, + need_rgp=predict_rgp, need_modules=project_modules, + need_spots=project_spots) + + pangenome_params = argparse.Namespace(**{step:argparse.Namespace(**k_v) for step, k_v in pangenome.parameters.items()}) output_dir = Path(args.output) mk_outdir(output_dir, args.force) @@ -160,15 +173,15 @@ def launch(args: argparse.Namespace): # if so add a warning or error # TODO some params are no keep in pangenome... like use_pseudo. what to do? - logging.getLogger().info('Retrieving pangenome parameters from the provided pangenome file.') - + # with tables.open_file(pangenome, "r+"): + # pangenome_parameter = get_pangenome_parameters(h5f) if args.annot_file is not None: # read_annotations(pangenome, args.anno, cpu=args.cpu, pseudo=args.use_pseudo, disable_bar=args.disable_prog_bar) input_organism, has_sequence = read_anno_file(organism_name = args.organism_name, filename=args.annot_file, circular_contigs=[], - pseudo=False) + pseudo=False) if not has_sequence: if args.fasta_file: @@ -186,42 +199,37 @@ def launch(args: argparse.Namespace): raise Exception("At least one of --fasta_file or --anno_file must be given") - # load pangenome - - pangenome = Pangenome() - pangenome.add_file(args.pangenome) - - check_pangenome_info(pangenome, need_annotations=True, need_families=True, disable_bar=args.disable_prog_bar, - need_rgp=args.predict_rgp, need_modules=args.project_modules, - need_spots=args.project_spots) # Add input organism in pangenome. This temporary as pangenome is not going to be written. 
pangenome.add_organism(input_organism) - annotate_input_genes_with_pangenome_families(pangenome, input_organism=input_organism, output=output_dir, cpu=args.cluster.cpu, - no_defrag = args.cluster.no_defrag, identity = args.cluster.identity, coverage = args.cluster.coverage, tmpdir=args.tmpdir, - disable_bar=args.disable_prog_bar, translation_table = args.annotate.translation_table ) + annotate_input_genes_with_pangenome_families(pangenome, input_organism=input_organism, output=output_dir, cpu=args.cpu, + no_defrag = args.no_defrag, identity = args.identity, coverage = args.coverage, tmpdir=args.tmpdir, + disable_bar=args.disable_prog_bar, translation_table = args.translation_table) + if predict_rgp: + logging.getLogger().info('Detecting rgp in input genome.') + logging.getLogger().info('Retrieving rgp parameters from the provided pangenome file.') - if args.predict_rgp: logging.getLogger().info("Detecting multigenic families...") - multigenics = pangenome.get_multigenics(args.rgp.dup_margin) + multigenics = pangenome.get_multigenics(pangenome_params.rgp.dup_margin) - input_org_rgps = predict_RGP(pangenome, input_organism, persistent_penalty=args.rgp.persistent_penalty, variable_gain=args.rgp.variable_gain, - min_length=args.rgp.min_length, min_score=args.rgp.min_score, multigenics=multigenics, + input_org_rgps = predict_RGP(pangenome, input_organism, persistent_penalty=pangenome_params.rgp.persistent_penalty, variable_gain=pangenome_params.rgp.variable_gain, + min_length=pangenome_params.rgp.min_length, min_score=pangenome_params.rgp.min_score, multigenics=multigenics, disable_bar=args.disable_prog_bar) + all_rgps = list(input_org_rgps) + pangenome.regions write_predicted_regions(input_org_rgps, output=output_dir) - spots = predict_spots(all_rgps, multigenics, output=output_dir, spot_graph=args.spot.spot_graph, - overlapping_match=args.spot.overlapping_match, set_size=args.spot.set_size, - exact_match=args.spot.exact_match_size) + spots = predict_spots(all_rgps, 
multigenics, output=output_dir, spot_graph=False,#args.spot.spot_graph, + overlapping_match=pangenome_params.spot.overlapping_match, set_size=pangenome_params.spot.set_size, + exact_match=pangenome_params.spot.exact_match_size) - if args.project_modules: - write_projected_modules_to_input_organism(pangenome, input_organism, output_dir) + if project_modules: + projetc_and_write_modules(pangenome, input_organism, output_dir) # write_flat_files_for_input_genome(input_organism) @@ -247,8 +255,9 @@ def predict_spots(rgps: list, multigenics: set, output: str, overlapping_match=overlapping_match, set_size=set_size, exact_match=exact_match) -def write_projected_modules_to_input_organism(pangenome, input_organism, output, compress=False): - """Write a tsv file providing association between modules and organisms +def projetc_and_write_modules(pangenome, input_organism, output, compress=False): + """ + Write a tsv file providing association between modules and the input organism :param output: Path to output directory :param compress: Compress the file in .gz @@ -320,43 +329,59 @@ def parser_projection(parser: argparse.ArgumentParser): time.localtime()) + "_PID" + str(os.getpid()), help="Output directory") - optional.add_argument('--predict_rgp', required=False, action='store_true', default=False, - help="Predict RGPs and hot spots on the input genome.") - optional.add_argument('--project_modules', required=False, action='store_true', default=False, - help="Project pangenome modules to the input genome.") - optional.add_argument('--project_spots', required=False, action='store_true', default=False, - help="Project pangenome spots to the input genome.") + # optional.add_argument('--rgp', required=False, action='store_true', default=False, + # help="Predict RGPs and hot spots on the input genome.") + # optional.add_argument('--module', required=False, action='store_true', default=False, + # help="Project pangenome modules to the input genome.") + # 
optional.add_argument('--spots', required=False, action='store_true', default=False, + # help="Project pangenome spots to the input genome.") + optional.add_argument("--tmpdir", required=False, type=str, default=Path(tempfile.gettempdir()), help="directory for storing temporary files") # optional.add_argument("--basename", required=False, default="pangenome", help="basename for the output file") # annotate = parser.add_argument_group(title="Annotation arguments") - - # annotate.add_argument('--allow_overlap', required=False, action='store_true', default=False, - # help="Use to not remove genes overlapping with RNA features.") - # annotate.add_argument("--norna", required=False, action="store_true", default=False, - # help="Use to avoid annotating RNA features.") - # annotate.add_argument("--kingdom", required=False, type=str.lower, default="bacteria", - # choices=["bacteria", "archaea"], - # help="Kingdom to which the prokaryota belongs to, " - # "to know which models to use for rRNA annotation.") - # annotate.add_argument("--translation_table", required=False, type=int, default=11, + optional.add_argument('--allow_overlap', required=False, action='store_true', default=False, + help="Use to not remove genes overlapping with RNA features.") + optional.add_argument("--norna", required=False, action="store_true", default=False, + help="Use to avoid annotating RNA features.") + optional.add_argument("--kingdom", required=False, type=str.lower, default="bacteria", + choices=["bacteria", "archaea"], + help="Kingdom to which the prokaryota belongs to, " + "to know which models to use for rRNA annotation.") + # optional.add_argument("--translation_table", required=False, type=int, default=11, # help="Translation table (genetic code) to use.") - # annotate.add_argument("--prodigal_procedure", required=False, type=str.lower, choices=["single", "meta"], - # default=None, help="Allow to force the prodigal procedure. 
" - # "If nothing given, PPanGGOLiN will decide in function of contig length") - # annotate.add_argument("--use_pseudo", required=False, action="store_true", - # help="In the context of provided annotation, use this option to read pseudogenes. " - # "(Default behavior is to ignore them)") - - # cluster = parser.add_argument_group(title="Clustering arguments") - # cluster.add_argument('--no_defrag', required=False, action="store_true", - # help="DO NOT Realign gene families to link fragments with" - # "their non-fragmented gene family.") - # cluster.add_argument('--identity', required=False, type=float, default=0.5, - # help="min identity percentage threshold") - # cluster.add_argument('--coverage', required=False, type=float, default=0.8, - # help="min coverage percentage threshold") + optional.add_argument("--prodigal_procedure", required=False, type=str.lower, choices=["single", "meta"], + default=None, help="Allow to force the prodigal procedure. " + "If nothing given, PPanGGOLiN will decide in function of contig length") + + optional.add_argument('--no_defrag', required=False, action="store_true", + help="DO NOT Realign gene families to link fragments with" + "their non-fragmented gene family. 
(default: False)") + + optional.add_argument('--identity', required=False, type=float, default=0.5, + help="min identity percentage threshold") + + optional.add_argument('--coverage', required=False, type=float, default=0.8, + help="min coverage percentage threshold") + + optional.add_argument("--translation_table", required=False, default="11", + help="Translation table (genetic code) to use.") + + # optional.add_argument("--getinfo", required=False, action="store_true", + # help="Use this option to extract info related to the best hit of each query, " + # "such as the RGP it is in, or the spots.") + + # optional.add_argument("--draw_related", required=False, action="store_true", + # help="Draw figures and provide graphs in a gexf format of the eventual spots" + # " associated to the input sequences") + + # but does not use the option + optional.add_argument("--use_pseudo", required=False, action="store_true", + help="In the context of provided annotation, use this option to read pseudogenes. 
" + "(Default behavior is to ignore them)") + optional.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus") + \ No newline at end of file From 3fe456b3df614e9967e94503d24f8ff5ec8c62b6 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Fri, 7 Jul 2023 10:54:35 +0200 Subject: [PATCH 044/173] fix param name --- ppanggolin/annotate/annotate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ppanggolin/annotate/annotate.py b/ppanggolin/annotate/annotate.py index b2317f1b..1090f65d 100644 --- a/ppanggolin/annotate/annotate.py +++ b/ppanggolin/annotate/annotate.py @@ -471,7 +471,7 @@ def read_annotations(pangenome: Pangenome, organisms_file: Path, cpu: int = 1, p pangenome.parameters["annotate"] = {} pangenome.parameters["annotate"]["# used_local_identifiers"] = used_local_identifiers pangenome.parameters["annotate"]["use_pseudo"] = pseudo - pangenome.parameters["annotate"]["read_annotations_from_file"] = True + pangenome.parameters["annotate"]["# read_annotations_from_file"] = True def get_gene_sequences_from_fastas(pangenome, fasta_file): From 161853d1c658c4ea1db0e98f8c709e758914497b Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Fri, 28 Jul 2023 16:02:39 +0200 Subject: [PATCH 045/173] add some logging --- ppanggolin/projection/projection.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index ebbdbaf1..29ae1ca3 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -164,6 +164,7 @@ def launch(args: argparse.Namespace): need_rgp=predict_rgp, need_modules=project_modules, need_spots=project_spots) + logging.getLogger().info('Retrieving parameters from the provided pangenome file.') pangenome_params = argparse.Namespace(**{step:argparse.Namespace(**k_v) for step, k_v in pangenome.parameters.items()}) output_dir = Path(args.output) @@ -209,7 +210,6 @@ def launch(args: 
argparse.Namespace): if predict_rgp: logging.getLogger().info('Detecting rgp in input genome.') - logging.getLogger().info('Retrieving rgp parameters from the provided pangenome file.') logging.getLogger().info("Detecting multigenic families...") @@ -251,7 +251,7 @@ def predict_spots(rgps: list, multigenics: set, output: str, :return: list of computed spots """ - spots = make_spot_graph(rgps=rgps, multigenics=multigenics, output=output, spot_graph=spot_graph, + spots = make_spot_graph(rgps=rgps, multigenics=multigenics, output=output, spot_graph=True, overlapping_match=overlapping_match, set_size=set_size, exact_match=exact_match) From b77e5e708e49c1ace93e70f84a8cf68501797655 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Mon, 31 Jul 2023 14:54:57 +0200 Subject: [PATCH 046/173] improve log message in parsing args from cli, config and default --- ppanggolin/utils.py | 105 ++++++++++++++++++++++++++------------------ 1 file changed, 62 insertions(+), 43 deletions(-) diff --git a/ppanggolin/utils.py b/ppanggolin/utils.py index 2aa18b62..d44e00d6 100755 --- a/ppanggolin/utils.py +++ b/ppanggolin/utils.py @@ -471,14 +471,15 @@ def get_arg_name(arg_val: Union[str, TextIOWrapper]) -> Union[str, TextIOWrapper def overwrite_args(default_args: argparse.Namespace, config_args: argparse.Namespace, cli_args: argparse.Namespace): """ Overwrite args objects. - When arguments are given in CLI, their value is used instead of the one found in config. - When arguments are specified in config they overwrite default values. + + When arguments are given in CLI, their values are used instead of the ones found in the config file. + When arguments are specified in the config file, they overwrite default values. 
:param default_args: default arguments - :param config_args: arguments parsed from config file - :param cli_args: arguments parsed from command line + :param config_args: arguments parsed from the config file + :param cli_args: arguments parsed from the command line - :return: final arguments + :return: final arguments """ args = argparse.Namespace() all_params = [arg for arg in dir(default_args) if not arg.startswith('_')] @@ -488,30 +489,48 @@ def overwrite_args(default_args: argparse.Namespace, config_args: argparse.Names cli_val = getattr(cli_args, param, 'unspecified') config_val = getattr(config_args, param, 'unspecified') - if param in cli_args: - # param is defined in cli, cli val is used + if param in cli_args and param not in config_args: + # Use the value from the command line argument setattr(args, param, cli_val) - if default_val != cli_val: + if default_val != cli_val and param != "config": logging.getLogger("PPanGGOLiN").debug( - f'Parameter "--{param} {get_arg_name(cli_val)}" has been specified in command line.' - f' Its value overwrites putative config values.') + f'The parameter "--{param}: {get_arg_name(cli_val)}" has been specified in the command line with a non-default value.' + f' Its value overwrites the default value ({get_arg_name(default_val)}).') - elif param in config_args: - # parma is defined only in config. config val is used + elif param not in cli_args and param in config_args: + # Use the value from the config file setattr(args, param, config_val) - if default_val != config_val: + if default_val != config_args: + logging.getLogger("PPanGGOLiN").debug( + f'The parameter "--{param}: {get_arg_name(config_val)}" has been specified in the config file with a non-default value.' 
+ f' Its value overwrites the default value ({get_arg_name(default_val)}).') + + elif param in cli_args and param in config_args: + # Use the value from the command line argument (cli) if it's different from the config file (config) + setattr(args, param, cli_val) + + if cli_val == config_val and cli_val != default_val: + logging.getLogger("PPanGGOLiN").debug( + f'The parameter "--{param} {get_arg_name(cli_val)}" has been specified in both the command line ' + f'and the config file with the same values, but with non-default value. ' + f'Its value overwrites the default value ({get_arg_name(default_val)}).') + + elif cli_val != config_val and param != "config": + # Values in cli and config differ. Use the value from the command line argument (cli) logging.getLogger("PPanGGOLiN").debug( - f'Parameter "{param}: {get_arg_name(config_val)}" has been specified in config file with non default value.' - f' Its value overwrites default value ({get_arg_name(default_val)}).') + f'The parameter "--{param}" has been specified in both the command line ("{get_arg_name(cli_val)}") ' + f'and the config file ("{get_arg_name(config_val)}") with different values. ' + f'The value from the command line argument is used.') else: - # param is not defined in cli and in config. default value is applied + # Parameter is not defined in cli and in config. Use the default value. setattr(args, param, default_val) return args + def combine_args(args: argparse.Namespace, another_args: argparse.Namespace): """ Combine two args object. 
@@ -667,41 +686,41 @@ def manage_cli_and_config_args(subcommand: str, config_file: str, subcommand_to_ # Add args namespace of the step to the inital args namespace setattr(args, workflow_step, step_args) - # manage projection step parameters - elif subcommand == "projection" : + # # manage projection step parameters + # elif subcommand == "projection" : - projection_step = "annotate" - workflow_steps = [projection_step] - logging.getLogger().debug(f'Parsing {projection_step} arguments in config file.') - step_subparser = subcommand_to_subparser[projection_step] + # projection_step = "annotate" + # workflow_steps = [projection_step] + # logging.getLogger().debug(f'Parsing {projection_step} arguments in config file.') + # step_subparser = subcommand_to_subparser[projection_step] - default_step_args = get_default_args(projection_step, step_subparser, unwanted_args=all_unspecific_params) + # default_step_args = get_default_args(projection_step, step_subparser, unwanted_args=all_unspecific_params) - # remove general args - all_param_names = {arg_name for arg_name in dir(default_step_args) if not arg_name.startswith('_')} - specific_step_params = {param_name for param_name in all_param_names if - param_name not in all_unspecific_params} + # # remove general args + # all_param_names = {arg_name for arg_name in dir(default_step_args) if not arg_name.startswith('_')} + # specific_step_params = {param_name for param_name in all_param_names if + # param_name not in all_unspecific_params} - config_step_args = get_config_args(projection_step, step_subparser, config, projection_step, - specific_step_params, strict_config_check=True) + # config_step_args = get_config_args(projection_step, step_subparser, config, projection_step, + # specific_step_params, strict_config_check=True) - step_args = overwrite_args(default_step_args, config_step_args, cli_args) + # step_args = overwrite_args(default_step_args, config_step_args, cli_args) - step_params_that_differ = 
get_args_that_differe_from_default(default_step_args, step_args) + # step_params_that_differ = get_args_that_differe_from_default(default_step_args, step_args) - if step_params_that_differ: - step_params_that_differ_str = ', '.join([f'{p}={v}' for p, v in step_params_that_differ.items()]) - logging.getLogger().debug( - f"{len(step_params_that_differ)} {projection_step} parameters have a non-default value: {step_params_that_differ_str}") + # if step_params_that_differ: + # step_params_that_differ_str = ', '.join([f'{p}={v}' for p, v in step_params_that_differ.items()]) + # logging.getLogger().debug( + # f"{len(step_params_that_differ)} {projection_step} parameters have a non-default value: {step_params_that_differ_str}") - # add step name to differentiate the params - step_params_that_differ = {f'{projection_step}:{param}': value for param, value in - step_params_that_differ.items()} + # # add step name to differentiate the params + # step_params_that_differ = {f'{projection_step}:{param}': value for param, value in + # step_params_that_differ.items()} - params_that_differ.update(step_params_that_differ) + # params_that_differ.update(step_params_that_differ) - # Add args namespace of the step to the inital args namespace - setattr(args, projection_step, step_args) + # # Add args namespace of the step to the inital args namespace + # setattr(args, projection_step, step_args) if params_that_differ: @@ -903,8 +922,8 @@ def get_cli_args(subparser_fct: Callable) -> argparse.Namespace: # remove argument that have not been specified delete_unspecified_args(cli_args) delattr(cli_args, 'subcommand') - if 'config' in cli_args: - delattr(cli_args, 'config') + # if 'config' in cli_args: + # delattr(cli_args, 'config') return cli_args From 27d2a9f39eeed523eb8c32bdc0b236b3a004543e Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Mon, 31 Jul 2023 17:22:06 +0200 Subject: [PATCH 047/173] manage config and default annotate params when they are missing in pangenomes internal params 
--- ppanggolin/projection/projection.py | 110 +++++++++++++++++++--------- ppanggolin/utils.py | 39 +--------- 2 files changed, 78 insertions(+), 71 deletions(-) diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index 29ae1ca3..f6941f3c 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -9,17 +9,20 @@ import time from pathlib import Path import tempfile -from typing import Tuple, Set, Dict, Iterator +from typing import Tuple, Set, Dict, Iterator, Optional, List # installed libraries from tqdm import tqdm +from collections import defaultdict + # # local libraries from ppanggolin.annotate.synta import annotate_organism, read_fasta, get_dna_sequence from ppanggolin.annotate.annotate import read_anno_file +from ppanggolin.annotate import subparser as annotate_subparser from ppanggolin.pangenome import Pangenome from ppanggolin.cluster.cluster import infer_singletons # from ppanggolin.genome import input_organism, Gene, RNA, Contig -from ppanggolin.utils import read_compressed_or_not, write_compressed_or_not, mk_file_name, min_one, restricted_float, mk_outdir +from ppanggolin.utils import read_compressed_or_not, write_compressed_or_not, mk_file_name, min_one, restricted_float, mk_outdir, get_config_args, parse_config_file, get_default_args from ppanggolin.align.alignOnPang import get_seq2pang, project_and_write_partition from ppanggolin.formats.writeSequences import write_gene_sequences_from_annotations from ppanggolin.formats.readBinaries import get_pangenome_parameters, check_pangenome_info @@ -142,6 +145,69 @@ def retrieve_gene_sequences_from_fasta_file(input_organism, fasta_file): +def manage_annotate_param(annotate_param_names: List[str], pangenome_args: argparse.Namespace, + config_file: Optional[str]) -> argparse.Namespace: + """ + Manage annotate parameters by collecting them from different sources and merging them. 
+ + :param annotate_param_names: List of annotate parameter names to be managed. + :param pangenome_args: Annotate arguments parsed from pangenomes parameters. + :param config_file: Path to the config file, can be None if not provided. + + :return: An argparse.Namespace containing the merged annotate parameters with their values. + """ + + default_annotate_args = get_default_args('annotate', annotate_subparser) + + if config_file is None: + config_annotate_args = argparse.Namespace() + else: + config = defaultdict(dict, parse_config_file(config_file)) + config_annotate_args = get_config_args('annotate', annotate_subparser, config, "annotate", annotate_param_names, strict_config_check=False) + + annotate_param_from_pangenome = {} + annotate_param_from_config = {} + annotate_param_from_default = {} + + annotate_params = argparse.Namespace() + + # Collecting annotate parameters from different sources + for annotate_arg in annotate_param_names: + if hasattr(pangenome_args, annotate_arg): + param_val = getattr(pangenome_args, annotate_arg) + annotate_param_from_pangenome[annotate_arg] = param_val + setattr(annotate_params, annotate_arg, param_val) + + elif hasattr(config_annotate_args, annotate_arg): + param_val = getattr(config_annotate_args, annotate_arg) + annotate_param_from_config[annotate_arg] = param_val + setattr(annotate_params, annotate_arg, param_val) + + else: + param_val = getattr(default_annotate_args, annotate_arg) + annotate_param_from_default[annotate_arg] = param_val + setattr(annotate_params, annotate_arg, param_val) + + # Log the sources of the annotate parameters + if len(annotate_param_from_pangenome) > 0: + param_val_string = ' '.join([f'--{k} {v}' for k, v in annotate_param_from_pangenome.items()]) + logging.getLogger("PPanGGOLiN").debug(f"{len(annotate_param_from_pangenome)}/{len(annotate_param_names)} annotate parameters extracted from pangenome parameters " + f"(the parameters used to build the input pangenome): {param_val_string}") + + if 
len(annotate_param_from_config) > 0: + param_val_string = ';'.join([f' {k} : {v}' for k, v in annotate_param_from_config.items()]) + logging.getLogger("PPanGGOLiN").debug(f"{len(annotate_param_from_config)}/{len(annotate_param_names)} annotate parameters were not found in pangenome internal parameters." + f" They have been parsed from the annotate section in the config file: {param_val_string}") + + if len(annotate_param_from_default) > 0: + param_val_string = ';'.join([f' {k} : {v}' for k, v in annotate_param_from_default.items()]) + logging.getLogger("PPanGGOLiN").debug(f"{len(annotate_param_from_default)}/{len(annotate_param_names)} annotate parameters were not found in the pangenome parameters " + f"nor in the config file. Default values have been used: {param_val_string}") + + return annotate_params + + + def launch(args: argparse.Namespace): """ @@ -178,11 +244,12 @@ def launch(args: argparse.Namespace): # pangenome_parameter = get_pangenome_parameters(h5f) if args.annot_file is not None: + # read_annotations(pangenome, args.anno, cpu=args.cpu, pseudo=args.use_pseudo, disable_bar=args.disable_prog_bar) input_organism, has_sequence = read_anno_file(organism_name = args.organism_name, filename=args.annot_file, circular_contigs=[], - pseudo=False) + pseudo=args.use_pseudo) if not has_sequence: if args.fasta_file: @@ -192,15 +259,19 @@ def launch(args: argparse.Namespace): "Thus, we do not have the information we need to continue the projection.") elif args.fasta_file is not None: + annotate_param_names = ["norna", "kingdom", "allow_overlap", "prodigal_procedure"] + + annotate_params = manage_annotate_param(annotate_param_names, pangenome_params.annotate, args.config) + + input_organism = annotate_organism(org_name=args.organism_name, file_name = args.fasta_file, circular_contigs=[], tmpdir=args.tmpdir, - code = args.annotate.translation_table, norna=args.annotate.norna, kingdom = args.annotate.kingdom, - overlap=args.annotate.allow_overlap, 
procedure=args.annotate.prodigal_procedure) + code = args.translation_table, norna=annotate_params.norna, kingdom = annotate_params.kingdom, + overlap=annotate_params.allow_overlap, procedure=annotate_params.prodigal_procedure) else: raise Exception("At least one of --fasta_file or --anno_file must be given") - # Add input organism in pangenome. This temporary as pangenome is not going to be written. pangenome.add_organism(input_organism) @@ -338,25 +409,7 @@ def parser_projection(parser: argparse.ArgumentParser): optional.add_argument("--tmpdir", required=False, type=str, default=Path(tempfile.gettempdir()), help="directory for storing temporary files") - # optional.add_argument("--basename", required=False, default="pangenome", help="basename for the output file") - # annotate = parser.add_argument_group(title="Annotation arguments") - - optional.add_argument('--allow_overlap', required=False, action='store_true', default=False, - help="Use to not remove genes overlapping with RNA features.") - optional.add_argument("--norna", required=False, action="store_true", default=False, - help="Use to avoid annotating RNA features.") - optional.add_argument("--kingdom", required=False, type=str.lower, default="bacteria", - choices=["bacteria", "archaea"], - help="Kingdom to which the prokaryota belongs to, " - "to know which models to use for rRNA annotation.") - # optional.add_argument("--translation_table", required=False, type=int, default=11, - # help="Translation table (genetic code) to use.") - - optional.add_argument("--prodigal_procedure", required=False, type=str.lower, choices=["single", "meta"], - default=None, help="Allow to force the prodigal procedure. " - "If nothing given, PPanGGOLiN will decide in function of contig length") - optional.add_argument('--no_defrag', required=False, action="store_true", help="DO NOT Realign gene families to link fragments with" "their non-fragmented gene family. 
(default: False)") @@ -370,15 +423,6 @@ def parser_projection(parser: argparse.ArgumentParser): optional.add_argument("--translation_table", required=False, default="11", help="Translation table (genetic code) to use.") - # optional.add_argument("--getinfo", required=False, action="store_true", - # help="Use this option to extract info related to the best hit of each query, " - # "such as the RGP it is in, or the spots.") - - # optional.add_argument("--draw_related", required=False, action="store_true", - # help="Draw figures and provide graphs in a gexf format of the eventual spots" - # " associated to the input sequences") - - # but does not use the option optional.add_argument("--use_pseudo", required=False, action="store_true", help="In the context of provided annotation, use this option to read pseudogenes. " "(Default behavior is to ignore them)") diff --git a/ppanggolin/utils.py b/ppanggolin/utils.py index d44e00d6..2582d5f2 100755 --- a/ppanggolin/utils.py +++ b/ppanggolin/utils.py @@ -29,7 +29,7 @@ # all params that should be in the general_parameters section of the config file ALL_GENERAL_PARAMS = ['output', 'basename', 'rarefaction', 'no_flat_files', 'tmpdir', 'verbose', 'log', - 'disable_prog_bar', 'force'] + 'disable_prog_bar', 'force', "config"] WORKFLOW_SUBCOMMANDS = {'all', 'workflow', 'panrgp', 'panModule'} @@ -686,43 +686,6 @@ def manage_cli_and_config_args(subcommand: str, config_file: str, subcommand_to_ # Add args namespace of the step to the inital args namespace setattr(args, workflow_step, step_args) - # # manage projection step parameters - # elif subcommand == "projection" : - - # projection_step = "annotate" - # workflow_steps = [projection_step] - # logging.getLogger().debug(f'Parsing {projection_step} arguments in config file.') - # step_subparser = subcommand_to_subparser[projection_step] - - # default_step_args = get_default_args(projection_step, step_subparser, unwanted_args=all_unspecific_params) - - # # remove general args - # 
all_param_names = {arg_name for arg_name in dir(default_step_args) if not arg_name.startswith('_')} - # specific_step_params = {param_name for param_name in all_param_names if - # param_name not in all_unspecific_params} - - # config_step_args = get_config_args(projection_step, step_subparser, config, projection_step, - # specific_step_params, strict_config_check=True) - - # step_args = overwrite_args(default_step_args, config_step_args, cli_args) - - # step_params_that_differ = get_args_that_differe_from_default(default_step_args, step_args) - - # if step_params_that_differ: - # step_params_that_differ_str = ', '.join([f'{p}={v}' for p, v in step_params_that_differ.items()]) - # logging.getLogger().debug( - # f"{len(step_params_that_differ)} {projection_step} parameters have a non-default value: {step_params_that_differ_str}") - - # # add step name to differentiate the params - # step_params_that_differ = {f'{projection_step}:{param}': value for param, value in - # step_params_that_differ.items()} - - # params_that_differ.update(step_params_that_differ) - - # # Add args namespace of the step to the inital args namespace - # setattr(args, projection_step, step_args) - - if params_that_differ: logging.getLogger("PPanGGOLiN").info(f'{len(params_that_differ)} parameters have a non-default value.') From 92df2919b5ba12d9422315235e21f88f6c524639 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 1 Aug 2023 15:44:55 +0200 Subject: [PATCH 048/173] manage prodigal parameter when None in config --- ppanggolin/projection/projection.py | 1 - ppanggolin/utils.py | 5 +++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index f6941f3c..a38248cb 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -307,7 +307,6 @@ def launch(args: argparse.Namespace): def predict_spots(rgps: list, multigenics: set, output: str, spot_graph: bool = False, 
overlapping_match: int = 2, set_size: int = 3, exact_match: int = 1): - """ Create a spot graph from pangenome RGP diff --git a/ppanggolin/utils.py b/ppanggolin/utils.py index 2582d5f2..5907961f 100755 --- a/ppanggolin/utils.py +++ b/ppanggolin/utils.py @@ -502,7 +502,7 @@ def overwrite_args(default_args: argparse.Namespace, config_args: argparse.Names # Use the value from the config file setattr(args, param, config_val) - if default_val != config_args: + if default_val != config_val: logging.getLogger("PPanGGOLiN").debug( f'The parameter "--{param}: {get_arg_name(config_val)}" has been specified in the config file with a non-default value.' f' Its value overwrites the default value ({get_arg_name(default_val)}).') @@ -745,10 +745,11 @@ def set_up_config_param_to_parser(config_param_val: dict) -> list: arguments_to_parse = [] for param, val in config_param_val.items(): - if type(val) == bool: + if type(val) == bool or val is None or val == "None": # param is a flag if val is True: arguments_to_parse.append(f"--{param}") + # if val is False or None we don't add id to the else: arguments_to_parse.append(f"--{param}") From ba8b22bd6b3427597e816677ec70c0b1d494e33b Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Thu, 3 Aug 2023 17:01:58 +0200 Subject: [PATCH 049/173] make more explicit loging --- ppanggolin/formats/writeBinaries.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/ppanggolin/formats/writeBinaries.py b/ppanggolin/formats/writeBinaries.py index a6043f65..6b89b204 100644 --- a/ppanggolin/formats/writeBinaries.py +++ b/ppanggolin/formats/writeBinaries.py @@ -1015,14 +1015,14 @@ def write_pangenome(pangenome: Pangenome, filename, force: bool = False, disable h5f = tables.open_file(filename, "a") if pangenome.status["geneSequences"] == "Computed": - logging.getLogger("PPanGGOLiN").info("writing the protein coding gene dna sequences") + logging.getLogger("PPanGGOLiN").info("writing the protein coding gene dna sequences in 
pangenome...") write_gene_sequences(pangenome, h5f, disable_bar=disable_bar) pangenome.status["geneSequences"] = "Loaded" if pangenome.status["genesClustered"] == "Computed": - logging.getLogger("PPanGGOLiN").info("Writing gene families and gene associations...") + logging.getLogger("PPanGGOLiN").info("Writing gene families and gene associations in pangenome...") write_gene_families(pangenome, h5f, force, disable_bar=disable_bar) - logging.getLogger("PPanGGOLiN").info("Writing gene families information...") + logging.getLogger("PPanGGOLiN").info("Writing gene families information in pangenome...") write_gene_fam_info(pangenome, h5f, force, disable_bar=disable_bar) if pangenome.status["genomesAnnotated"] in ["Loaded", "inFile"] and \ pangenome.status["defragmented"] == "Computed": @@ -1031,7 +1031,7 @@ def write_pangenome(pangenome: Pangenome, filename, force: bool = False, disable update_gene_fragments(pangenome, h5f, disable_bar=disable_bar) pangenome.status["genesClustered"] = "Loaded" if pangenome.status["neighborsGraph"] == "Computed": - logging.getLogger("PPanGGOLiN").info("Writing the edges...") + logging.getLogger("PPanGGOLiN").info("Writing the edges of neighbors graph in pangenome...") write_graph(pangenome, h5f, force, disable_bar=disable_bar) pangenome.status["neighborsGraph"] = "Loaded" if pangenome.status["partitioned"] == "Computed" and \ @@ -1040,17 +1040,17 @@ def write_pangenome(pangenome: Pangenome, filename, force: bool = False, disable pangenome.status["partitioned"] = "Loaded" if pangenome.status['predictedRGP'] == "Computed": - logging.getLogger("PPanGGOLiN").info("Writing Regions of Genomic Plasticity...") + logging.getLogger("PPanGGOLiN").info("Writing Regions of Genomic Plasticity in pangenome...") write_rgp(pangenome, h5f, force, disable_bar=disable_bar) pangenome.status['predictedRGP'] = "Loaded" if pangenome.status["spots"] == "Computed": - logging.getLogger("PPanGGOLiN").info("Writing Spots of Insertion...") + 
logging.getLogger("PPanGGOLiN").info("Writing Spots of Insertion in pangenome...") write_spots(pangenome, h5f, force, disable_bar=disable_bar) pangenome.status['spots'] = "Loaded" if pangenome.status["modules"] == "Computed": - logging.getLogger("PPanGGOLiN").info("Writing Modules...") + logging.getLogger("PPanGGOLiN").info("Writing Modules in pangenome...") write_modules(pangenome, h5f, force, disable_bar=disable_bar) pangenome.status["modules"] = "Loaded" From 546e6c8faebef6e02d7871c5ca2408538be59803 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Thu, 3 Aug 2023 17:03:47 +0200 Subject: [PATCH 050/173] split make_spot_graph function in smaller fct --- ppanggolin/RGP/spot.py | 78 +++++++++++++++++++++++++++--------------- 1 file changed, 50 insertions(+), 28 deletions(-) diff --git a/ppanggolin/RGP/spot.py b/ppanggolin/RGP/spot.py index f54722da..64fbedd7 100644 --- a/ppanggolin/RGP/spot.py +++ b/ppanggolin/RGP/spot.py @@ -7,6 +7,7 @@ import time import os from pathlib import Path +from typing import List # installed libraries import networkx as nx @@ -70,20 +71,18 @@ def check_sim(pair_border1: list, pair_border2: list, overlapping_match: int = 2 return False -def make_spot_graph(rgps: list, multigenics: set, output: Path, spot_graph: bool = False, overlapping_match: int = 2, - set_size: int = 3, exact_match: int = 1) -> list: +def make_spot_graph(rgps: list, multigenics: set, overlapping_match: int = 2, + set_size: int = 3, exact_match: int = 1) -> nx.Graph: """ Create a spot graph from pangenome RGP :param rgps: list of pangenome RGP :param multigenics: pangenome graph multigenic persistent families - :param output: Output directory to save the spot graph - :param spot_graph: Writes gexf graph of pairs of blocks of single copy markers flanking RGPs from same hotspot :param overlapping_match: Number of missing persistent genes allowed when comparing flanking genes :param set_size: Number of single copy markers to use as flanking genes for RGP during hotspot 
computation :param exact_match: Number of perfectly matching flanking single copy markers required to associate RGPs - :return: list of computed spot + :return: spot graph """ def add_new_node(g: nx.Graph, region: Region, borders: list): @@ -128,24 +127,31 @@ def add_new_node(g: nx.Graph, region: Region, borders: list): if check_sim([node_obj_i["border0"], node_obj_i["border1"]], [node_obj_j["border0"], node_obj_j["border1"]], overlapping_match, set_size, exact_match): graph_spot.add_edge(nodei, nodej) - spots = [] - spot_id = 0 - for comp in nx.algorithms.components.connected_components(graph_spot): - curr_spot = Spot(spot_id) - spots.append(curr_spot) - for node in comp: - curr_spot.add_regions(graph_spot.nodes[node]["rgp"]) - spot_id += 1 - if spot_graph: + return graph_spot + +def write_spot_graph(graph_spot, outdir, graph_formats): + for node in graph_spot.nodes: - del graph_spot.nodes[node]["border0"] - del graph_spot.nodes[node]["border1"] - del graph_spot.nodes[node]["rgp"] - nx.readwrite.gexf.write_gexf(graph_spot, output / "spotGraph.gexf") - nx.readwrite.graphml.write_graphml(graph_spot, output / "spotGraph.graphml") - return spots + graph_spot.nodes[node]["border0"] = ';'.join([fam.name for fam in graph_spot.nodes[node]["border0"]]) + graph_spot.nodes[node]["border1"] = ';'.join([fam.name for fam in graph_spot.nodes[node]["border1"]]) + # del graph_spot.nodes[node]["border0"] + # del graph_spot.nodes[node]["border1"] + + graph_spot.nodes[node]["organisms"] = ';'.join({rgp.organism.name for rgp in graph_spot.nodes[node]["rgp"]}) + + graph_spot.nodes[node]["rgp"] = ';'.join([rgp.name for rgp in graph_spot.nodes[node]["rgp"]]) + + + if "gexf" in graph_formats: + outfile = outdir / "spotGraph.gexf" + logging.info(f'Writing spot graph in {outfile}') + nx.readwrite.gexf.write_gexf(graph_spot, outdir / "spotGraph.gexf") + if "graphml" in graph_formats: + outfile = outdir / "spotGraph.graphml" + logging.info(f'Writing spot graph in {outfile}') + 
nx.readwrite.graphml.write_graphml(graph_spot, outdir / "spotGraph.graphml") def check_pangenome_former_spots(pangenome: Pangenome, force: bool = False): @@ -162,14 +168,15 @@ def check_pangenome_former_spots(pangenome: Pangenome, force: bool = False): erase_pangenome(pangenome, spots=True) -def predict_hotspots(pangenome: Pangenome, output: Path, spot_graph: bool = False, overlapping_match: int = 2, +def predict_hotspots(pangenome: Pangenome, output: Path, spot_graph: bool = False, graph_formats: List[str] = ['gexf'], overlapping_match: int = 2, set_size: int = 3, exact_match: int = 1, force: bool = False, disable_bar: bool = False): """ Main function to predict hotspot :param pangenome: Blank pangenome object :param output: Output directory to save the spot graph - :param spot_graph: Writes gexf graph of pairs of blocks of single copy markers flanking RGPs from same hotspot + :param spot_graph: Writes graph of pairs of blocks of single copy markers flanking RGPs from same hotspot + :param graph_formats: Set of graph file formats to save the output :param overlapping_match: Number of missing persistent genes allowed when comparing flanking genes :param set_size: Number of single copy markers to use as flanking genes for RGP during hotspot computation :param exact_match: Number of perfectly matching flanking single copy markers required to associate RGPs @@ -195,9 +202,22 @@ def predict_hotspots(pangenome: Pangenome, output: Path, spot_graph: bool = Fals logging.getLogger("PPanGGOLiN").info("Detecting hotspots in the pangenome...") - # predict spots - spots = make_spot_graph(pangenome.regions, multigenics, output, spot_graph, overlapping_match, set_size, + # make spots + graph_spot = make_spot_graph(pangenome.regions, multigenics, overlapping_match, set_size, exact_match) + + spots = [] + for spot_id, comp in enumerate(nx.algorithms.components.connected_components(graph_spot)): + curr_spot = Spot(spot_id) + spots.append(curr_spot) + + for node in comp: + 
curr_spot.add_regions(graph_spot.nodes[node]["rgp"]) + if spot_graph: + graph_spot.nodes[node]["spot_id"] = f"spot_{spot_id}" + + if spot_graph: + write_spot_graph(graph_spot, output, graph_formats) if len(spots) == 0: logging.getLogger("PPanGGOLiN").warning("No spots were detected.") @@ -222,9 +242,10 @@ def launch(args: argparse.Namespace): pangenome.add_file(args.pangenome) if args.spot_graph: mk_outdir(args.output, args.force) - predict_hotspots(pangenome, args.output, force=args.force, spot_graph=args.spot_graph, + predict_hotspots(pangenome, args.output, force=args.force, + spot_graph=args.spot_graph, graph_formats=args.graph_formats, overlapping_match=args.overlapping_match, set_size=args.set_size, - exact_match=args.exact_match_size, disable_bar=args.disable_prog_bar) + exact_match=args.exact_match_size, disable_bar=args.disable_prog_bar, ) write_pangenome(pangenome, pangenome.file, args.force, disable_bar=args.disable_prog_bar) @@ -256,7 +277,7 @@ def parser_spot(parser: argparse.ArgumentParser): f"_PID{str(os.getpid())}"), help="Output directory") optional.add_argument("--spot_graph", required=False, action="store_true", - help="Writes a graph in .gexf format of pairs of blocks of single copy markers flanking RGPs," + help="Writes a graph of pairs of blocks of single copy markers flanking RGPs," " supposedly belonging to the same hotspot") optional.add_argument("--overlapping_match", required=False, type=int, default=2, help="The number of 'missing' persistent genes allowed when comparing flanking genes during " @@ -268,7 +289,8 @@ def parser_spot(parser: argparse.ArgumentParser): help="Number of perfectly matching flanking single copy markers required to associate RGPs " "during hotspot computation (Ex: If set to 1, two RGPs are in the same hotspot " "if both their 1st flanking genes are the same)") - + optional.add_argument('--graph_formats', required=False, type=str, choices=['gexf', "graphml"], nargs="+", + default=['gexf'], help="Format of the output 
graph.") if __name__ == '__main__': """To test local change and allow using debugger""" From eb8015de99c3507bb6eecf55997f4c51cdcdb0a9 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Mon, 7 Aug 2023 14:59:18 +0200 Subject: [PATCH 051/173] manage spot projection in input orga --- ppanggolin/RGP/spot.py | 65 +++++------ ppanggolin/annotate/annotate.py | 2 +- ppanggolin/projection/projection.py | 174 +++++++++++++++++++++++----- 3 files changed, 180 insertions(+), 61 deletions(-) diff --git a/ppanggolin/RGP/spot.py b/ppanggolin/RGP/spot.py index 64fbedd7..b1238238 100644 --- a/ppanggolin/RGP/spot.py +++ b/ppanggolin/RGP/spot.py @@ -70,6 +70,28 @@ def check_sim(pair_border1: list, pair_border2: list, overlapping_match: int = 2 return True return False +def add_new_node_in_spot_graph(g: nx.Graph, region: Region, borders: list) -> str: + """ + Add bordering region as node to graph + + :param g: spot graph + :param region: region in spot + :param borders: bordering families in spot + :return blocks: name of the node that has been added + """ + blocks = str(sorted([[gene.family.ID for gene in borders[0]], [gene.family.ID for gene in borders[1]]], + key=lambda x: x[0])) + g.add_node(blocks) + try: + g.nodes[blocks]["nb_rgp"] += 1 + g.nodes[blocks]["rgp"].add(region) + except KeyError: + g.nodes[blocks]["nb_rgp"] = 1 + g.nodes[blocks]["border1"] = [gene.family for gene in borders[1]] + g.nodes[blocks]["border0"] = [gene.family for gene in borders[0]] + g.nodes[blocks]["rgp"] = {region} + + return blocks def make_spot_graph(rgps: list, multigenics: set, overlapping_match: int = 2, set_size: int = 3, exact_match: int = 1) -> nx.Graph: @@ -85,26 +107,6 @@ def make_spot_graph(rgps: list, multigenics: set, overlapping_match: int = 2, :return: spot graph """ - def add_new_node(g: nx.Graph, region: Region, borders: list): - """ - Add bordering region as node to graph - - :param g: spot graph - :param region: region in spot - :param borders: bordering families in spot - """ - blocks 
= str(sorted([[gene.family.ID for gene in borders[0]], [gene.family.ID for gene in borders[1]]], - key=lambda x: x[0])) - g.add_node(blocks) - try: - g.nodes[blocks]["nb_rgp"] += 1 - g.nodes[blocks]["rgp"].add(region) - except KeyError: - g.nodes[blocks]["nb_rgp"] = 1 - g.nodes[blocks]["border1"] = [gene.family for gene in borders[1]] - g.nodes[blocks]["border0"] = [gene.family for gene in borders[0]] - g.nodes[blocks]["rgp"] = {region} - graph_spot = nx.Graph() lost = 0 used = 0 @@ -114,7 +116,7 @@ def add_new_node(g: nx.Graph, region: Region, borders: list): lost += 1 else: used += 1 - add_new_node(graph_spot, rgp, border) + add_new_node_in_spot_graph(graph_spot, rgp, border) logging.getLogger("PPanGGOLiN").info(f"{lost} RGPs were not used as they are on a contig border (or have " f"less than {set_size} persistent gene families until the contig border)") logging.getLogger("PPanGGOLiN").info(f"{used} RGPs are being used to predict spots of insertion") @@ -130,28 +132,23 @@ def add_new_node(g: nx.Graph, region: Region, borders: list): return graph_spot -def write_spot_graph(graph_spot, outdir, graph_formats): - +def write_spot_graph(graph_spot, outdir, graph_formats, file_basename="spotGraph"): for node in graph_spot.nodes: - graph_spot.nodes[node]["border0"] = ';'.join([fam.name for fam in graph_spot.nodes[node]["border0"]]) graph_spot.nodes[node]["border1"] = ';'.join([fam.name for fam in graph_spot.nodes[node]["border1"]]) - # del graph_spot.nodes[node]["border0"] - # del graph_spot.nodes[node]["border1"] graph_spot.nodes[node]["organisms"] = ';'.join({rgp.organism.name for rgp in graph_spot.nodes[node]["rgp"]}) - graph_spot.nodes[node]["rgp"] = ';'.join([rgp.name for rgp in graph_spot.nodes[node]["rgp"]]) - if "gexf" in graph_formats: - outfile = outdir / "spotGraph.gexf" - logging.info(f'Writing spot graph in {outfile}') - nx.readwrite.gexf.write_gexf(graph_spot, outdir / "spotGraph.gexf") + outfile = outdir / f"{file_basename}.gexf" + 
logging.getLogger("PPanGGOLiN").info(f'Writing spot graph in {outfile}') + nx.readwrite.gexf.write_gexf(graph_spot, outfile) + if "graphml" in graph_formats: - outfile = outdir / "spotGraph.graphml" - logging.info(f'Writing spot graph in {outfile}') - nx.readwrite.graphml.write_graphml(graph_spot, outdir / "spotGraph.graphml") + outfile = outdir / f"{file_basename}.graphml" + logging.getLogger("PPanGGOLiN").info(f'Writing spot graph in {outfile}') + nx.readwrite.graphml.write_graphml(graph_spot, outfile) def check_pangenome_former_spots(pangenome: Pangenome, force: bool = False): diff --git a/ppanggolin/annotate/annotate.py b/ppanggolin/annotate/annotate.py index 1090f65d..561eb451 100644 --- a/ppanggolin/annotate/annotate.py +++ b/ppanggolin/annotate/annotate.py @@ -574,7 +574,7 @@ def annotate_pangenome(pangenome: Pangenome, fasta_list: Path, tmpdir: str, cpu: pangenome.parameters["annotate"]["norna"] = norna pangenome.parameters["annotate"]["kingdom"] = kingdom pangenome.parameters["annotate"]["translation_table"] = translation_table - pangenome.parameters["annotate"]["prodigal_procedure"] = False if procedure is None else procedure + pangenome.parameters["annotate"]["prodigal_procedure"] = None if procedure is None else procedure pangenome.parameters["annotate"]["# read_annotations_from_file"] = False diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index a38248cb..65043b43 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -10,10 +10,12 @@ from pathlib import Path import tempfile from typing import Tuple, Set, Dict, Iterator, Optional, List -# installed libraries -from tqdm import tqdm +from itertools import combinations from collections import defaultdict +# installed libraries +from tqdm import tqdm +import networkx as nx # # local libraries from ppanggolin.annotate.synta import annotate_organism, read_fasta, get_dna_sequence @@ -28,10 +30,10 @@ from 
ppanggolin.formats.readBinaries import get_pangenome_parameters, check_pangenome_info # from ppanggolin.formats import write_pangenome from ppanggolin.RGP.genomicIsland import naming_scheme, compute_org_rgp -from ppanggolin.RGP.spot import make_spot_graph +from ppanggolin.RGP.spot import make_spot_graph, check_sim, add_new_node_in_spot_graph, write_spot_graph from ppanggolin.genome import Organism, Gene, RNA, Contig from ppanggolin.geneFamily import GeneFamily -from ppanggolin.region import Region +from ppanggolin.region import Region, Spot from ppanggolin.formats.writeFlat import write_flat_files def annotate_input_genes_with_pangenome_families(pangenome, input_organism, output, cpu, no_defrag, identity, coverage, tmpdir, @@ -41,6 +43,8 @@ def annotate_input_genes_with_pangenome_families(pangenome, input_organism, out """ + + seq_fasta_file = output / f"{input_organism.name}.fasta" with open(seq_fasta_file, "w") as fh_out_faa: @@ -263,7 +267,6 @@ def launch(args: argparse.Namespace): annotate_params = manage_annotate_param(annotate_param_names, pangenome_params.annotate, args.config) - input_organism = annotate_organism(org_name=args.organism_name, file_name = args.fasta_file, circular_contigs=[], tmpdir=args.tmpdir, code = args.translation_table, norna=annotate_params.norna, kingdom = annotate_params.kingdom, overlap=annotate_params.allow_overlap, procedure=annotate_params.prodigal_procedure) @@ -290,42 +293,161 @@ def launch(args: argparse.Namespace): min_length=pangenome_params.rgp.min_length, min_score=pangenome_params.rgp.min_score, multigenics=multigenics, disable_bar=args.disable_prog_bar) - all_rgps = list(input_org_rgps) + pangenome.regions + # all_rgps = list(input_org_rgps) + pangenome.regions write_predicted_regions(input_org_rgps, output=output_dir) - spots = predict_spots(all_rgps, multigenics, output=output_dir, spot_graph=False,#args.spot.spot_graph, - overlapping_match=pangenome_params.spot.overlapping_match, 
set_size=pangenome_params.spot.set_size, - exact_match=pangenome_params.spot.exact_match_size) + input_org_rgp_to_spots = predict_spots_in_input_organism(initial_spots=pangenome.spots, + initial_regions=pangenome.regions, + input_org_rgps=input_org_rgps, + multigenics=multigenics, output=output_dir, + write_graph_flag=True, graph_formats=['graphml'], + overlapping_match=pangenome_params.spot.overlapping_match, + set_size=pangenome_params.spot.set_size, + exact_match=pangenome_params.spot.exact_match_size) - - if project_modules: - projetc_and_write_modules(pangenome, input_organism, output_dir) + project_and_write_modules(pangenome, input_organism, output_dir) - # write_flat_files_for_input_genome(input_organism) +def check_spots_congruency(graph_spot: nx.Graph, spots: List[Spot]) -> None: + """ + Check congruency of spots in the spot graph with the original spots. -def predict_spots(rgps: list, multigenics: set, output: str, - spot_graph: bool = False, overlapping_match: int = 2, set_size: int = 3, exact_match: int = 1): + :param graph_spot: The spot graph containing the connected components representing the spots. + :param spots: List of original spots in the pangenome. + :return: None. """ - Create a spot graph from pangenome RGP + rgp_to_spot = {region: spot for spot in spots for region in spot.regions} + + spots = [] + for cc in nx.algorithms.components.connected_components(graph_spot): + # one connected component is a spot + regions_in_cc = set() + for node in cc: + regions_in_cc |= graph_spot.nodes[node]["rgp"] + + # check that region in cc are the regions of a spot + spot_in_cc = {rgp_to_spot[rgp] for rgp in regions_in_cc} + assert len(spot_in_cc) == 1, "More than one spot in a connected_components. Something went wrong when recomputing spots." 
+ current_spot = spot_in_cc.pop() + # Add spot id to the graph + for node in cc: + graph_spot.nodes[node]["spot_id"] = f"{current_spot}" + - :param rgps: list of pangenome RGP - :param multigenics: pangenome graph multigenic persistent families - :param output: Output directory to save the spot graph - :param spot_graph: Writes gexf graph of pairs of blocks of single copy markers flanking RGPs from same hotspot - :param overlapping_match: Number of missing persistent genes allowed when comparing flanking genes - :param set_size: Number of single copy markers to use as flanking genes for RGP during hotspot computation - :param exact_match: Number of perfectly matching flanking single copy markers required to associate RGPs - :return: list of computed spots +def predict_spots_in_input_organism(initial_spots: List[Spot], initial_regions: List[Region], + input_org_rgps: List, multigenics: Set, output: str, + write_graph_flag: bool = False, graph_formats: List[str] = ['gexf'], + overlapping_match: int = 2, set_size: int = 3, exact_match: int = 1) -> Dict: + """ + Create a spot graph from pangenome RGP and predict spots for input organism RGPs. + + :param initial_spots: List of original spots in the pangenome. + :param initial_regions: List of original regions in the pangenome. + :param input_org_rgps: List of RGPs from the input organism to be associated with spots. + :param multigenics: Set of pangenome graph multigenic persistent families. + :param output: Output directory to save the spot graph. + :param write_graph_flag: If True, writes the spot graph in the specified formats. + :param graph_formats: List of graph formats to write (default is ['gexf']). + :param overlapping_match: Number of missing persistent genes allowed when comparing flanking genes. + :param set_size: Number of single copy markers to use as flanking genes for RGP during hotspot computation. 
+ :param exact_match: Number of perfectly matching flanking single copy markers required to associate RGPs. + + :return: A dictionary mapping input organism RGPs to their predicted spots. """ - spots = make_spot_graph(rgps=rgps, multigenics=multigenics, output=output, spot_graph=True, + logging.getLogger("PPanGGOLiN").info(f"Rebuilding spot graph.") + graph_spot = make_spot_graph(rgps=initial_regions, multigenics=multigenics, overlapping_match=overlapping_match, set_size=set_size, exact_match=exact_match) + + original_nodes = set(graph_spot.nodes) + + # Check congruency with already computed spot and add spot id in node attributes + check_spots_congruency(graph_spot, initial_spots) + + + # Check which input RGP has a spot + lost = 0 + used = 0 + + input_org_node_to_rgps = defaultdict(set) + + for rgp in input_org_rgps: + border = rgp.get_bordering_genes(set_size, multigenics) + if len(border[0]) < set_size or len(border[1]) < set_size: + lost += 1 + else: + used += 1 + border_node = add_new_node_in_spot_graph(graph_spot, rgp, border) + input_org_node_to_rgps[border_node].add(rgp) + + if len(input_org_node_to_rgps) == 0: + logging.getLogger("PPanGGOLiN").info(f"No RGPs of the input organism will be associated with any spot of insertion " + "as they are on a contig border (or have " + f"less than {set_size} persistent gene families until the contig border). 
" + "Projection of spots stops here") + return {} + + # remove node that were already in the graph + new_nodes = set(input_org_node_to_rgps) - original_nodes + + logging.getLogger("PPanGGOLiN").info(f"{lost} RGPs of the input organism won't be associated with any spot of insertion " + "as they are on a contig border (or have " + f"less than {set_size} persistent gene families until the contig border)") + + logging.getLogger("PPanGGOLiN").info(f"{used} RGPs of the input organism will be associated to a spot of insertion") + + # add potential edges from new nodes to the rest of the nodes + all_nodes = list(graph_spot.nodes) + for nodei in new_nodes: + for nodej in all_nodes: + if nodei == nodej: + continue + node_obj_i = graph_spot.nodes[nodei] + node_obj_j = graph_spot.nodes[nodej] + if check_sim([node_obj_i["border0"], node_obj_i["border1"]], + [node_obj_j["border0"], node_obj_j["border1"]], + overlapping_match, set_size, exact_match): + graph_spot.add_edge(nodei, nodej) + + input_rgp_to_spots = {} + new_spot_id_counter = max((s.ID for s in initial_spots)) + 1 + # determine spot ids of the new nodes and by extension to their rgps + for comp in nx.algorithms.components.connected_components(graph_spot): + # in very rare case one cc can have several original spots + # that would mean a new nodes from the input organism have connected two old cc + # in this case we report the two spots in the output + spots_of_the_cc = {graph_spot.nodes[n]["spot_id"] for n in comp if 'spot_id' in graph_spot.nodes[n]} + if len(spots_of_the_cc) == 0: + # no spot associated with any node of the cc + # that means this cc is only composed of new nodes + # let's add a new spot id + spots_of_the_cc = {f"new_spot_{new_spot_id_counter}"} + new_spot_id_counter += 1 + elif len(spots_of_the_cc) > 1: + # more than one spot in the cc + logging.getLogger("PPanGGOLiN").info('Some RGPs of the input organism ' + f"are connected to {len(spots_of_the_cc)} original spots of the pangenome.") + + 
input_rgps_of_the_cc = set() + for node in comp: + if node in input_org_node_to_rgps: + input_rgps_of_the_cc |= input_org_node_to_rgps[node] + + if write_graph_flag: + graph_spot.nodes[node]["projected_spot_id"] = ';'.join(spots_of_the_cc) + graph_spot.nodes[node]["node_with_input_org_RGPs"] = True + + input_rgp_to_spots.update({rgp:spots_of_the_cc for rgp in input_rgps_of_the_cc}) + + if write_graph_flag: + write_spot_graph(graph_spot, output, graph_formats, file_basename='projected_spotGraph') + + return input_rgp_to_spots -def projetc_and_write_modules(pangenome, input_organism, output, compress=False): +def project_and_write_modules(pangenome, input_organism, output, compress=False): """ Write a tsv file providing association between modules and the input organism From 0a0f1753ae04b09f4200f3fcad4f967553570845 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Mon, 7 Aug 2023 18:16:10 +0200 Subject: [PATCH 052/173] fix spot id attribute in graph spot --- ppanggolin/RGP/spot.py | 4 +- ppanggolin/projection/projection.py | 120 +++++++++++++++++++--------- 2 files changed, 83 insertions(+), 41 deletions(-) diff --git a/ppanggolin/RGP/spot.py b/ppanggolin/RGP/spot.py index b1238238..617c6e64 100644 --- a/ppanggolin/RGP/spot.py +++ b/ppanggolin/RGP/spot.py @@ -90,7 +90,7 @@ def add_new_node_in_spot_graph(g: nx.Graph, region: Region, borders: list) -> st g.nodes[blocks]["border1"] = [gene.family for gene in borders[1]] g.nodes[blocks]["border0"] = [gene.family for gene in borders[0]] g.nodes[blocks]["rgp"] = {region} - + return blocks def make_spot_graph(rgps: list, multigenics: set, overlapping_match: int = 2, @@ -211,7 +211,7 @@ def predict_hotspots(pangenome: Pangenome, output: Path, spot_graph: bool = Fals for node in comp: curr_spot.add_regions(graph_spot.nodes[node]["rgp"]) if spot_graph: - graph_spot.nodes[node]["spot_id"] = f"spot_{spot_id}" + graph_spot.nodes[node]["spot_id"] = str(curr_spot) if spot_graph: write_spot_graph(graph_spot, output, graph_formats) 
diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index 65043b43..2b45c604 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -34,7 +34,19 @@ from ppanggolin.genome import Organism, Gene, RNA, Contig from ppanggolin.geneFamily import GeneFamily from ppanggolin.region import Region, Spot -from ppanggolin.formats.writeFlat import write_flat_files +from ppanggolin.formats.writeFlat import write_flat_files, spot2rgp, summarize_spots + + + +class NewSpot(Spot): + """ + This class represent a hotspot specifically + created for the projected genome. + """ + + def __str__(self): + return f'new_spot_{str(self.ID)}' + def annotate_input_genes_with_pangenome_families(pangenome, input_organism, output, cpu, no_defrag, identity, coverage, tmpdir, disable_bar, translation_table, ): @@ -101,11 +113,11 @@ def predict_RGP(pangenome: Pangenome, input_organism: Organism, persistent_penal rgps = compute_org_rgp(input_organism, multigenics, persistent_penalty, variable_gain, min_length, min_score, naming=name_scheme, disable_bar=disable_bar) - logging.getLogger().info(f"{len(rgps)} RGPs have been predicted the input genomes.") + logging.getLogger().info(f"{len(rgps)} RGPs have been predicted in the input genomes.") return rgps -def write_predicted_regions(regions : Set[Region], output:Path, compress=False): +def write_predicted_regions(regions : Set[Region], input_org_rgp_to_spots:Dict[Region, Set[Spot]], output:Path, compress=False): """ Write the file providing information about RGP content @@ -114,11 +126,15 @@ def write_predicted_regions(regions : Set[Region], output:Path, compress=False): """ fname = output / "plastic_regions.tsv" with write_compressed_or_not(fname, compress) as tab: - tab.write("region\torganism\tcontig\tstart\tstop\tgenes\tcontigBorder\twholeContig\n") + tab.write("region\torganism\tcontig\tstart\tstop\tgenes\tcontigBorder\twholeContig\tspot_id\n") regions = sorted(regions, 
key=lambda x: (x.organism.name, x.contig.name, x.start)) for region in regions: + + spots = input_org_rgp_to_spots.get(region, {"No_spot"}) + spots_str = ';'.join([str(spot) for spot in spots]) + tab.write('\t'.join(map(str, [region.name, region.organism, region.contig, region.start, region.stop, - len(region.genes), region.is_contig_border, region.is_whole_contig])) + "\n") + len(region.genes), region.is_contig_border, region.is_whole_contig, spots_str])) + "\n") def retrieve_gene_sequences_from_fasta_file(input_organism, fasta_file): @@ -220,6 +236,11 @@ def launch(args: argparse.Namespace): :param args: All arguments provide by user """ + + output_dir = Path(args.output) + mk_outdir(output_dir, args.force) + + # For the moment this element of the pangenome are predicted by default project_modules = True predict_rgp = True @@ -237,15 +258,11 @@ def launch(args: argparse.Namespace): logging.getLogger().info('Retrieving parameters from the provided pangenome file.') pangenome_params = argparse.Namespace(**{step:argparse.Namespace(**k_v) for step, k_v in pangenome.parameters.items()}) - output_dir = Path(args.output) - mk_outdir(output_dir, args.force) # TODO check that the provided input_organism name is not found in pangenome # if so add a warning or error - # TODO some params are no keep in pangenome... like use_pseudo. what to do? - # with tables.open_file(pangenome, "r+"): - # pangenome_parameter = get_pangenome_parameters(h5f) + if args.annot_file is not None: @@ -275,7 +292,7 @@ def launch(args: argparse.Namespace): raise Exception("At least one of --fasta_file or --anno_file must be given") - # Add input organism in pangenome. This temporary as pangenome is not going to be written. + # Add input organism in pangenome. This is temporary as the pangenome object is not going to be written. 
pangenome.add_organism(input_organism) annotate_input_genes_with_pangenome_families(pangenome, input_organism=input_organism, output=output_dir, cpu=args.cpu, @@ -293,21 +310,23 @@ def launch(args: argparse.Namespace): min_length=pangenome_params.rgp.min_length, min_score=pangenome_params.rgp.min_score, multigenics=multigenics, disable_bar=args.disable_prog_bar) - # all_rgps = list(input_org_rgps) + pangenome.regions - - write_predicted_regions(input_org_rgps, output=output_dir) - - input_org_rgp_to_spots = predict_spots_in_input_organism(initial_spots=pangenome.spots, - initial_regions=pangenome.regions, - input_org_rgps=input_org_rgps, - multigenics=multigenics, output=output_dir, - write_graph_flag=True, graph_formats=['graphml'], - overlapping_match=pangenome_params.spot.overlapping_match, - set_size=pangenome_params.spot.set_size, - exact_match=pangenome_params.spot.exact_match_size) + if len(input_org_rgps) > 0: + input_org_rgp_to_spots = predict_spots_in_input_organism(initial_spots=pangenome.spots, + initial_regions=pangenome.regions, + input_org_rgps=input_org_rgps, + multigenics=multigenics, output=output_dir, + write_graph_flag=args.spot_graph, graph_formats=args.graph_formats, + overlapping_match=pangenome_params.spot.overlapping_match, + set_size=pangenome_params.spot.set_size, + exact_match=pangenome_params.spot.exact_match_size) + + write_predicted_regions(input_org_rgps, input_org_rgp_to_spots, output=output_dir, compress=False) + else: + logging.getLogger('PPanGGOLiN').info('No RGPs have been predicted in the input genomes. 
Spot prediction and RGP output are skipped.') + project_and_write_modules(pangenome, input_organism, output_dir) - + def check_spots_congruency(graph_spot: nx.Graph, spots: List[Spot]) -> None: """ @@ -332,7 +351,8 @@ def check_spots_congruency(graph_spot: nx.Graph, spots: List[Spot]) -> None: current_spot = spot_in_cc.pop() # Add spot id to the graph for node in cc: - graph_spot.nodes[node]["spot_id"] = f"{current_spot}" + graph_spot.nodes[node]["spot_id"] = str(current_spot) + graph_spot.nodes[node]["spots"] = {current_spot} @@ -412,19 +432,27 @@ def predict_spots_in_input_organism(initial_spots: List[Spot], initial_regions: graph_spot.add_edge(nodei, nodej) input_rgp_to_spots = {} + new_spots = [] new_spot_id_counter = max((s.ID for s in initial_spots)) + 1 # determine spot ids of the new nodes and by extension to their rgps for comp in nx.algorithms.components.connected_components(graph_spot): # in very rare case one cc can have several original spots # that would mean a new nodes from the input organism have connected two old cc # in this case we report the two spots in the output - spots_of_the_cc = {graph_spot.nodes[n]["spot_id"] for n in comp if 'spot_id' in graph_spot.nodes[n]} + spots_of_the_cc = set() + for node in comp: + if "spots" in graph_spot.nodes[node]: + spots_of_the_cc |= {spot for spot in graph_spot.nodes[node]["spots"]} + if len(spots_of_the_cc) == 0: # no spot associated with any node of the cc # that means this cc is only composed of new nodes # let's add a new spot id - spots_of_the_cc = {f"new_spot_{new_spot_id_counter}"} + new_spot = NewSpot(new_spot_id_counter) + new_spots.append(new_spot) + spots_of_the_cc = {new_spot} # {f"new_spot_{new_spot_id_counter}"} new_spot_id_counter += 1 + elif len(spots_of_the_cc) > 1: # more than one spot in the cc logging.getLogger("PPanGGOLiN").info('Some RGPs of the input organism ' @@ -434,16 +462,31 @@ def predict_spots_in_input_organism(initial_spots: List[Spot], initial_regions: for node in comp: if 
node in input_org_node_to_rgps: input_rgps_of_the_cc |= input_org_node_to_rgps[node] - + if write_graph_flag: - graph_spot.nodes[node]["projected_spot_id"] = ';'.join(spots_of_the_cc) - graph_spot.nodes[node]["node_with_input_org_RGPs"] = True - + graph_spot.nodes[node]["spots"] = spots_of_the_cc + + graph_spot.nodes[node]["spot_id"] = ';'.join((str(spot) for spot in spots_of_the_cc)) + graph_spot.nodes[node]["includes_RGPs_from_the_input_organism"] = True + + for spot in spots_of_the_cc: + spot.add_regions(input_rgps_of_the_cc) + input_rgp_to_spots.update({rgp:spots_of_the_cc for rgp in input_rgps_of_the_cc}) if write_graph_flag: + for node in graph_spot.nodes: + del graph_spot.nodes[node]["spots"] + write_spot_graph(graph_spot, output, graph_formats, file_basename='projected_spotGraph') + + spot_with_new_rgp = {spot for spots in input_rgp_to_spots.values() for spot in spots} + + spot2rgp(spot_with_new_rgp, output=output, compress = False) + + summarize_spots(spot_with_new_rgp, output, compress = False) + return input_rgp_to_spots @@ -521,13 +564,6 @@ def parser_projection(parser: argparse.ArgumentParser): time.localtime()) + "_PID" + str(os.getpid()), help="Output directory") - # optional.add_argument('--rgp', required=False, action='store_true', default=False, - # help="Predict RGPs and hot spots on the input genome.") - # optional.add_argument('--module', required=False, action='store_true', default=False, - # help="Project pangenome modules to the input genome.") - # optional.add_argument('--spots', required=False, action='store_true', default=False, - # help="Project pangenome spots to the input genome.") - optional.add_argument("--tmpdir", required=False, type=str, default=Path(tempfile.gettempdir()), help="directory for storing temporary files") @@ -548,5 +584,11 @@ def parser_projection(parser: argparse.ArgumentParser): help="In the context of provided annotation, use this option to read pseudogenes. 
" "(Default behavior is to ignore them)") + optional.add_argument("--spot_graph", required=False, action="store_true", + help="Write the spot graph to a file, with pairs of blocks of single copy markers flanking RGPs " + "as nodes. This graph can be used to visualize nodes that have RGPs from the input organism.") + + optional.add_argument('--graph_formats', required=False, type=str, choices=['gexf', "graphml"], nargs="+", + default=['gexf'], help="Format of the output graph.") + optional.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus") - \ No newline at end of file From 75b6c8ca3477376c0a3bab70c2279b6f7ff85a71 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Mon, 7 Aug 2023 18:16:35 +0200 Subject: [PATCH 053/173] add projection cmd in github action --- .github/workflows/main.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index f424b17b..f083980f 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -117,3 +117,11 @@ jobs: cd testingDataset ppanggolin utils --default_config panrgp -o panrgp_default_config.yaml ppanggolin panrgp --anno organisms.gbff.list --cluster clusters.tsv -o test_config --config panrgp_default_config.yaml + cd - + - name: testing projection cmd + shell: bash -l {0} + run: | + cd testingDataset + ppanggolin projection --pangenome myannopang/pangenome.h5 -o projection_from_gbff --organism_name trotro --annot GBFF/GCF_000026905.1_ASM2690v1_genomic.gbff.gz --spot_graph + ppanggolin projection --pangenome myannopang/pangenome.h5 -o projection_from_fasta --organism_name trotro --fasta FASTA/GCF_000026905.1_ASM2690v1_genomic.fna.gz --spot_graph --graph_formats graphml + \ No newline at end of file From c8bc1064f9aefb6f28c7dce9ae289d4559c90752 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Mon, 7 Aug 2023 18:22:13 +0200 Subject: [PATCH 054/173] force cython < v3 in setup.py to prevent error in macOS --- 
setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index c5b67109..ada32680 100755 --- a/setup.py +++ b/setup.py @@ -14,7 +14,7 @@ url="https://github.com/labgem/PPanGGOLiN", description="Pangenome analysis suite", packages=setuptools.find_packages(), - setup_requires=["cython"], + setup_requires=["cython<3.0.0"], install_requires=[], package_data={'': ['rRNA_DB/*cm*']}, classifiers=["Environment :: Console", From 2e9fbe377aeb1ae214c0134bef390c9590c0ae33 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 8 Aug 2023 10:33:20 +0200 Subject: [PATCH 055/173] fix Path type in argument --- ppanggolin/meta/meta.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ppanggolin/meta/meta.py b/ppanggolin/meta/meta.py index 06bbbe7f..28689d5b 100644 --- a/ppanggolin/meta/meta.py +++ b/ppanggolin/meta/meta.py @@ -165,7 +165,7 @@ def parser_meta(parser: argparse.ArgumentParser): """ required = parser.add_argument_group(title="Required arguments", description="All of the following arguments are required :") - required.add_argument('-p', '--pangenome', required=False, type=str, help="The pangenome .h5 file") + required.add_argument('-p', '--pangenome', required=False, type=Path, help="The pangenome .h5 file") required.add_argument('-m', '--metadata', required=False, type=Path, nargs='?', help='Metadata in TSV file. 
See our github for more detail about format') required.add_argument("-s", "--source", required=False, type=str, nargs="?", From 58ded6bba9acb90ab29f319d38b3ab4ad162f510 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 8 Aug 2023 11:13:54 +0200 Subject: [PATCH 056/173] prevent creation of outdir in default config --- ppanggolin/utility/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ppanggolin/utility/utils.py b/ppanggolin/utility/utils.py index 0829a5ea..ff222f02 100644 --- a/ppanggolin/utility/utils.py +++ b/ppanggolin/utility/utils.py @@ -215,7 +215,6 @@ def launch_default_config(args: argparse.Namespace): arg_lines.append(f"\n{sub_command}:") arg_lines += get_default_argument_lines(specific_actions) - mk_outdir(args.output.parent, args.force) logging.getLogger("PPanGGOLiN").info(f'Writting default config in {args.output}') with open(args.output, 'w') as fl: fl.write('\n'.join(arg_lines) + '\n') From 924496726a0d527b7db52ecb455c6f0bcbfb9fcf Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 8 Aug 2023 11:57:33 +0200 Subject: [PATCH 057/173] fiw input pan in projection cmd --- .github/workflows/main.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index f083980f..50a32c47 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -122,6 +122,6 @@ jobs: shell: bash -l {0} run: | cd testingDataset - ppanggolin projection --pangenome myannopang/pangenome.h5 -o projection_from_gbff --organism_name trotro --annot GBFF/GCF_000026905.1_ASM2690v1_genomic.gbff.gz --spot_graph - ppanggolin projection --pangenome myannopang/pangenome.h5 -o projection_from_fasta --organism_name trotro --fasta FASTA/GCF_000026905.1_ASM2690v1_genomic.fna.gz --spot_graph --graph_formats graphml + ppanggolin projection --pangenome mybasicpangenome/pangenome.h5 -o projection_from_gbff --organism_name trotro --annot GBFF/GCF_000026905.1_ASM2690v1_genomic.gbff.gz --spot_graph + 
ppanggolin projection --stepbystep mybasicpangenome/pangenome.h5 -o projection_from_fasta --organism_name trotro --fasta FASTA/GCF_000026905.1_ASM2690v1_genomic.fna.gz --spot_graph --graph_formats graphml \ No newline at end of file From 4ea465aefc3f1fb5de34a985f2fef634b2fd75ae Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 8 Aug 2023 12:03:29 +0200 Subject: [PATCH 058/173] improve spots output --- ppanggolin/formats/writeFlat.py | 10 +++++++--- ppanggolin/projection/projection.py | 13 ++++++------- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/ppanggolin/formats/writeFlat.py b/ppanggolin/formats/writeFlat.py index 181d8990..93941559 100644 --- a/ppanggolin/formats/writeFlat.py +++ b/ppanggolin/formats/writeFlat.py @@ -691,20 +691,24 @@ def write_regions(output: Path, compress: bool = False): len(region.genes), region.is_contig_border, region.is_whole_contig])) + "\n") -def summarize_spots(spots: set, output: Path, compress: bool = False): +def summarize_spots(spots: set, output: Path, compress: bool = False, file_name="summarize_spots.tsv"): """ Write a file providing summarize information about hotspots :param spots: set of spots in pangenome :param output: Path to output directory :param compress: Compress the file in .gz + :patam file_name: Name of the output file """ def r_and_s(value: float): """rounds to dp figures and returns a str of the provided value""" return str(round(value, 3)) if isinstance(value, float) else str(value) - with write_compressed_or_not(output / "summarize_spots.tsv", compress) as fout: + + file_path = output / file_name + + with write_compressed_or_not(file_path, compress) as fout: fout.write("spot\tnb_rgp\tnb_families\tnb_unique_family_sets\tmean_nb_genes\t" "stdev_nb_genes\tmax_nb_genes\tmin_nb_genes\n") for spot in sorted(spots, key=lambda x: len(x.regions), reverse=True): @@ -721,7 +725,7 @@ def r_and_s(value: float): min_size = min(size_list) fout.write("\t".join(map(r_and_s, [f"{spot.ID}", len(rgp_list), 
len(tot_fams), len_uniq_content, mean_size, stdev_size, max_size, min_size])) + "\n") - logging.getLogger("PPanGGOLiN").info(f"Done writing spots in : '{output.as_posix() + '/summarize_spots.tsv'}'") + logging.getLogger("PPanGGOLiN").info(f"Done writing spots in '{file_path}'") def spot2rgp(spots: set, output: Path, compress: bool = False): diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index 2b45c604..b4086cfa 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -322,6 +322,12 @@ def launch(args: argparse.Namespace): exact_match=pangenome_params.spot.exact_match_size) write_predicted_regions(input_org_rgps, input_org_rgp_to_spots, output=output_dir, compress=False) + + new_spots = {spot for spots in input_org_rgp_to_spots.values() for spot in spots if type(spot) == NewSpot} + if new_spots: + logging.getLogger('PPanGGOLiN').info(f'{len(new_spots)} new spots have been created for the input genome.') + summarize_spots(new_spots, output_dir, compress = False, file_name="New_spots_summary.tsv") + else: logging.getLogger('PPanGGOLiN').info('No RGPs have been predicted in the input genomes. 
Spot prediction and RGP output are skipped.') @@ -480,13 +486,6 @@ def predict_spots_in_input_organism(initial_spots: List[Spot], initial_regions: write_spot_graph(graph_spot, output, graph_formats, file_basename='projected_spotGraph') - - spot_with_new_rgp = {spot for spots in input_rgp_to_spots.values() for spot in spots} - - spot2rgp(spot_with_new_rgp, output=output, compress = False) - - summarize_spots(spot_with_new_rgp, output, compress = False) - return input_rgp_to_spots From 05005004f6f94e033d49c0c6171330ac75bb8d52 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 8 Aug 2023 12:06:46 +0200 Subject: [PATCH 059/173] add missing docstring --- ppanggolin/projection/projection.py | 45 +++++++++++++++++------------ 1 file changed, 27 insertions(+), 18 deletions(-) diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index b4086cfa..b5836a2c 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -46,48 +46,57 @@ class NewSpot(Spot): def __str__(self): return f'new_spot_{str(self.ID)}' +from pathlib import Path +import tempfile +import logging +from typing import Optional -def annotate_input_genes_with_pangenome_families(pangenome, input_organism, output, cpu, no_defrag, identity, coverage, tmpdir, - disable_bar, translation_table, ): - +def annotate_input_genes_with_pangenome_families(pangenome, input_organism, output: Path, cpu: int, no_defrag: bool, + identity: float, coverage: float, tmpdir: Path, + translation_table: int): """ + Annotate input genes with pangenome gene families and perform clustering. + + :param pangenome: Pangenome object. + :param input_organism: Input organism object. + :param output: Output directory for generated files. + :param cpu: Number of CPU cores to use. + :param no_defrag: Whether to use defragmentation. + :param identity: Minimum identity threshold for gene clustering. + :param coverage: Minimum coverage threshold for gene clustering. 
+ :param tmpdir: Temporary directory for intermediate files. + :param disable_bar: Whether to disable progress bar. + :param translation_table: Translation table ID for nucleotide sequences. + :return: None """ - - seq_fasta_file = output / f"{input_organism.name}.fasta" with open(seq_fasta_file, "w") as fh_out_faa: - write_gene_sequences_from_annotations(input_organism.genes, fh_out_faa, - disable_bar=True) # this progress bar is useless here.. so I disable it. - # get corresponding gene families + write_gene_sequences_from_annotations(input_organism.genes, fh_out_faa, disable_bar=True) + with tempfile.TemporaryDirectory(dir=tmpdir, prefix="seq_to_pang_tmpdir_") as new_tmpdir: - seq_set, _, seqid_to_gene_family = get_seq2pang(pangenome, seq_fasta_file, output, Path(new_tmpdir), cpu, no_defrag, identity=identity, - coverage=coverage, is_nucleotide=True, translation_table=translation_table) + seq_set, _, seqid_to_gene_family = get_seq2pang(pangenome, seq_fasta_file, output, Path(new_tmpdir), + cpu, no_defrag, identity=identity, coverage=coverage, + is_nucleotide=True, translation_table=translation_table) - # this function only write the seqid and partition associated in a file project_and_write_partition(seqid_to_gene_family, seq_set, output) - # Add gene of the input organism in the associated gene family - # when a gene is not associated with any gene family, a new family is created. 
lonely_gene = 0 for gene in input_organism.genes: try: gene_family = seqid_to_gene_family[gene.ID] gene_family.add_gene(gene) - except KeyError: - # add a new gene family new_gene_family = pangenome.add_gene_family(gene.ID) new_gene_family.add_gene(gene) new_gene_family.add_partition("Cloud") lonely_gene += 1 - logging.getLogger().info(f"The input organisms have {lonely_gene}/{input_organism.number_of_genes()} " - "genes that do not cluster with any of the gene families of the pangenome.") - + logging.getLogger().info(f"The input organism has {lonely_gene}/{input_organism.number_of_genes()} " + "genes that do not cluster with any of the gene families in the pangenome.") def predict_RGP(pangenome: Pangenome, input_organism: Organism, persistent_penalty: int, variable_gain: int, min_length: int, min_score: int, multigenics: float, From 8938b0e0de0e1faf022b08081d0e9aa814a1629f Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 8 Aug 2023 15:06:56 +0200 Subject: [PATCH 060/173] change output name --- ppanggolin/projection/projection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index b5836a2c..76b19843 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -335,7 +335,7 @@ def launch(args: argparse.Namespace): new_spots = {spot for spots in input_org_rgp_to_spots.values() for spot in spots if type(spot) == NewSpot} if new_spots: logging.getLogger('PPanGGOLiN').info(f'{len(new_spots)} new spots have been created for the input genome.') - summarize_spots(new_spots, output_dir, compress = False, file_name="New_spots_summary.tsv") + summarize_spots(new_spots, output_dir, compress = False, file_name="new_spots_summary.tsv") else: logging.getLogger('PPanGGOLiN').info('No RGPs have been predicted in the input genomes. 
Spot prediction and RGP output are skipped.') From bf9b2940f476f332079ef6dc6fa9ef10359d638b Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 8 Aug 2023 15:24:45 +0200 Subject: [PATCH 061/173] fix small bugs --- .github/workflows/main.yml | 4 ++-- ppanggolin/projection/projection.py | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 50a32c47..6a2acc8b 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -122,6 +122,6 @@ jobs: shell: bash -l {0} run: | cd testingDataset - ppanggolin projection --pangenome mybasicpangenome/pangenome.h5 -o projection_from_gbff --organism_name trotro --annot GBFF/GCF_000026905.1_ASM2690v1_genomic.gbff.gz --spot_graph - ppanggolin projection --stepbystep mybasicpangenome/pangenome.h5 -o projection_from_fasta --organism_name trotro --fasta FASTA/GCF_000026905.1_ASM2690v1_genomic.fna.gz --spot_graph --graph_formats graphml + ppanggolin projection --pangenome stepbystep/pangenome.h5 -o projection_from_gbff --organism_name trotro --annot GBFF/GCF_000026905.1_ASM2690v1_genomic.gbff.gz --spot_graph + ppanggolin projection --pangenome mybasicpangenome/pangenome.h5 -o projection_from_fasta --organism_name trotro --fasta FASTA/GCF_000026905.1_ASM2690v1_genomic.fna.gz --spot_graph --graph_formats graphml \ No newline at end of file diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index 76b19843..7ad8292d 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -306,7 +306,7 @@ def launch(args: argparse.Namespace): annotate_input_genes_with_pangenome_families(pangenome, input_organism=input_organism, output=output_dir, cpu=args.cpu, no_defrag = args.no_defrag, identity = args.identity, coverage = args.coverage, tmpdir=args.tmpdir, - disable_bar=args.disable_prog_bar, translation_table = args.translation_table) + translation_table = args.translation_table) if 
predict_rgp: logging.getLogger().info('Detecting rgp in input genome.') @@ -567,22 +567,22 @@ def parser_projection(parser: argparse.ArgumentParser): optional = parser.add_argument_group(title="Optional arguments") - optional.add_argument('-o', '--output', required=False, type=str, + optional.add_argument('-o', '--output', required=False, type=Path, default="ppanggolin_output" + time.strftime("_DATE%Y-%m-%d_HOUR%H.%M.%S", time.localtime()) + "_PID" + str(os.getpid()), help="Output directory") - optional.add_argument("--tmpdir", required=False, type=str, default=Path(tempfile.gettempdir()), + optional.add_argument("--tmpdir", required=False, type=Path, default=Path(tempfile.gettempdir()), help="directory for storing temporary files") optional.add_argument('--no_defrag', required=False, action="store_true", help="DO NOT Realign gene families to link fragments with" "their non-fragmented gene family. (default: False)") - optional.add_argument('--identity', required=False, type=float, default=0.5, + optional.add_argument('--identity', required=False, type=restricted_float, default=0.5, help="min identity percentage threshold") - optional.add_argument('--coverage', required=False, type=float, default=0.8, + optional.add_argument('--coverage', required=False, type=restricted_float, default=0.8, help="min coverage percentage threshold") optional.add_argument("--translation_table", required=False, default="11", From b1fcfd31ab7a42b6efde1e497c47ba726947f3bd Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 8 Aug 2023 15:31:45 +0200 Subject: [PATCH 062/173] fix argument type --- ppanggolin/projection/projection.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index 7ad8292d..e9a6d280 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -10,7 +10,6 @@ from pathlib import Path import tempfile from typing import Tuple, Set, 
Dict, Iterator, Optional, List -from itertools import combinations from collections import defaultdict # installed libraries @@ -22,19 +21,18 @@ from ppanggolin.annotate.annotate import read_anno_file from ppanggolin.annotate import subparser as annotate_subparser from ppanggolin.pangenome import Pangenome -from ppanggolin.cluster.cluster import infer_singletons # from ppanggolin.genome import input_organism, Gene, RNA, Contig -from ppanggolin.utils import read_compressed_or_not, write_compressed_or_not, mk_file_name, min_one, restricted_float, mk_outdir, get_config_args, parse_config_file, get_default_args +from ppanggolin.utils import read_compressed_or_not, write_compressed_or_not, restricted_float, mk_outdir, get_config_args, parse_config_file, get_default_args from ppanggolin.align.alignOnPang import get_seq2pang, project_and_write_partition from ppanggolin.formats.writeSequences import write_gene_sequences_from_annotations -from ppanggolin.formats.readBinaries import get_pangenome_parameters, check_pangenome_info +from ppanggolin.formats.readBinaries import check_pangenome_info # from ppanggolin.formats import write_pangenome from ppanggolin.RGP.genomicIsland import naming_scheme, compute_org_rgp from ppanggolin.RGP.spot import make_spot_graph, check_sim, add_new_node_in_spot_graph, write_spot_graph from ppanggolin.genome import Organism, Gene, RNA, Contig from ppanggolin.geneFamily import GeneFamily from ppanggolin.region import Region, Spot -from ppanggolin.formats.writeFlat import write_flat_files, spot2rgp, summarize_spots +from ppanggolin.formats.writeFlat import summarize_spots @@ -52,7 +50,7 @@ def __str__(self): from typing import Optional -def annotate_input_genes_with_pangenome_families(pangenome, input_organism, output: Path, cpu: int, no_defrag: bool, +def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_organism: Organism, output: Path, cpu: int, no_defrag: bool, identity: float, coverage: float, tmpdir: Path, 
translation_table: int): """ @@ -372,7 +370,7 @@ def check_spots_congruency(graph_spot: nx.Graph, spots: List[Spot]) -> None: def predict_spots_in_input_organism(initial_spots: List[Spot], initial_regions: List[Region], - input_org_rgps: List, multigenics: Set, output: str, + input_org_rgps: List[Region], multigenics: Set[GeneFamily], output: str, write_graph_flag: bool = False, graph_formats: List[str] = ['gexf'], overlapping_match: int = 2, set_size: int = 3, exact_match: int = 1) -> Dict: """ @@ -498,7 +496,7 @@ def predict_spots_in_input_organism(initial_spots: List[Spot], initial_regions: return input_rgp_to_spots -def project_and_write_modules(pangenome, input_organism, output, compress=False): +def project_and_write_modules(pangenome:Pangenome, input_organism: Organism, output:Path, compress:bool=False): """ Write a tsv file providing association between modules and the input organism From b0960731871f13e601940cf6bdac88f2ce2274a5 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 9 Aug 2023 17:31:44 +0200 Subject: [PATCH 063/173] Add check of whats in the pan and project accordingly --- ppanggolin/projection/projection.py | 158 +++++++++++++++++++--------- 1 file changed, 111 insertions(+), 47 deletions(-) diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index e9a6d280..89eb8ec5 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -11,6 +11,7 @@ import tempfile from typing import Tuple, Set, Dict, Iterator, Optional, List from collections import defaultdict +import csv # installed libraries from tqdm import tqdm @@ -44,10 +45,6 @@ class NewSpot(Spot): def __str__(self): return f'new_spot_{str(self.ID)}' -from pathlib import Path -import tempfile -import logging -from typing import Optional def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_organism: Organism, output: Path, cpu: int, no_defrag: bool, @@ -124,25 +121,65 @@ def predict_RGP(pangenome: 
Pangenome, input_organism: Organism, persistent_penal return rgps -def write_predicted_regions(regions : Set[Region], input_org_rgp_to_spots:Dict[Region, Set[Spot]], output:Path, compress=False): +def write_predicted_regions(regions: Set[Region], + output: Path, compress: bool = False): """ - Write the file providing information about RGP content + Write the file providing information about predicted regions. - :param output: Path to output directory - :param compress: Compress the file in .gz + :param regions: Set of Region objects representing predicted regions. + :param output: Path to the output directory. + :param compress: Whether to compress the file in .gz format. """ fname = output / "plastic_regions.tsv" with write_compressed_or_not(fname, compress) as tab: - tab.write("region\torganism\tcontig\tstart\tstop\tgenes\tcontigBorder\twholeContig\tspot_id\n") + fieldnames = ["region", "organism", "contig", "start", "stop", "genes", "contigBorder", "wholeContig"] + + writer = csv.DictWriter(tab, fieldnames=fieldnames, delimiter='\t') + writer.writeheader() + regions = sorted(regions, key=lambda x: (x.organism.name, x.contig.name, x.start)) for region in regions: + row = { + "region": region.name, + "organism": region.organism, + "contig": region.contig, + "start": region.start, + "stop": region.stop, + "genes": len(region.genes), + "contigBorder": region.is_contig_border, + "wholeContig": region.is_whole_contig + } - spots = input_org_rgp_to_spots.get(region, {"No_spot"}) - spots_str = ';'.join([str(spot) for spot in spots]) + writer.writerow(row) + + +def write_rgp_to_spot_table(rgp_to_spots: Dict[Region, Set[str]], output: Path, filename: str, compress: bool = False): + """ + Write a table mapping RGPs to corresponding spot IDs. + + :param rgp_to_spots: A dictionary mapping RGPs to spot IDs. + :param output: Path to the output directory. + :param filename: Name of the file to write. + :param compress: Whether to compress the file. 
+ """ + fname = output / filename + logging.getLogger('PPanGGOLiN').info(f'Writing RGPs to spot table in {fname}') + + with write_compressed_or_not(fname, compress) as tab: + fieldnames = ["region", "spot_id"] + + writer = csv.DictWriter(tab, fieldnames=fieldnames, delimiter='\t') + writer.writeheader() + + regions = sorted(rgp_to_spots.keys(), key=lambda x: (x.organism.name, x.contig.name, x.start)) + for region in regions: + row = { + "region": region.name, + "spot_id": ';'.join(map(str, rgp_to_spots[region])) + } + + writer.writerow(row) - tab.write('\t'.join(map(str, [region.name, region.organism, region.contig, region.start, region.stop, - len(region.genes), region.is_contig_border, region.is_whole_contig, spots_str])) + "\n") - def retrieve_gene_sequences_from_fasta_file(input_organism, fasta_file): """ @@ -252,12 +289,33 @@ def launch(args: argparse.Namespace): project_modules = True predict_rgp = True project_spots = True - # TODO : check that the different elements have been predicted in the pangenome. if not need to define a behavior... - # load pangenome pangenome = Pangenome() pangenome.add_file(args.pangenome) + if pangenome.status["partitioned"] not in ["Computed", "Loaded", "inFile"]: + raise NameError(f"The provided pangenome has not been partitioned. " + "Annotation of an external genome is therefore not possible. " + "See the 'partition' subcommands.") + + if pangenome.status["predictedRGP"] not in ["Computed", "Loaded", "inFile"]: + logging.getLogger().info("RGPs have not been predicted in the provided pangenome. " + "Projection of RGPs into the provided genome will not be performed.") + predict_rgp = False + + if pangenome.status["spots"] not in ["Computed", "Loaded", "inFile"]: + logging.getLogger().info("Spots have not been predicted in the provided pangenome. 
" + "Projection of spots into the provided genome will not be performed.") + project_spots = False + + + if pangenome.status["modules"] not in ["Computed", "Loaded", "inFile"]: + logging.getLogger().info("Modules have not been predicted in the provided pangenome. " + "Projection of modules into the provided genome will not be performed.") + + project_modules = False + + check_pangenome_info(pangenome, need_annotations=True, need_families=True, disable_bar=args.disable_prog_bar, need_rgp=predict_rgp, need_modules=project_modules, need_spots=project_spots) @@ -266,13 +324,10 @@ def launch(args: argparse.Namespace): pangenome_params = argparse.Namespace(**{step:argparse.Namespace(**k_v) for step, k_v in pangenome.parameters.items()}) - # TODO check that the provided input_organism name is not found in pangenome - # if so add a warning or error - - + if args.organism_name in [org.name for org in pangenome.organisms]: + raise NameError(f"The provided organism name '{args.organism_name}' already exists in the given pangenome.") if args.annot_file is not None: - # read_annotations(pangenome, args.anno, cpu=args.cpu, pseudo=args.use_pseudo, disable_bar=args.disable_prog_bar) input_organism, has_sequence = read_anno_file(organism_name = args.organism_name, filename=args.annot_file, @@ -316,28 +371,29 @@ def launch(args: argparse.Namespace): input_org_rgps = predict_RGP(pangenome, input_organism, persistent_penalty=pangenome_params.rgp.persistent_penalty, variable_gain=pangenome_params.rgp.variable_gain, min_length=pangenome_params.rgp.min_length, min_score=pangenome_params.rgp.min_score, multigenics=multigenics, disable_bar=args.disable_prog_bar) - - - if len(input_org_rgps) > 0: - input_org_rgp_to_spots = predict_spots_in_input_organism(initial_spots=pangenome.spots, - initial_regions=pangenome.regions, - input_org_rgps=input_org_rgps, - multigenics=multigenics, output=output_dir, - write_graph_flag=args.spot_graph, graph_formats=args.graph_formats, - 
overlapping_match=pangenome_params.spot.overlapping_match, - set_size=pangenome_params.spot.set_size, - exact_match=pangenome_params.spot.exact_match_size) + if len(input_org_rgps) == 0: + + logging.getLogger('PPanGGOLiN').info("No RGPs have been found in the input organisms. " + "As a result, spot prediction and RGP output will be skipped.") - write_predicted_regions(input_org_rgps, input_org_rgp_to_spots, output=output_dir, compress=False) - - new_spots = {spot for spots in input_org_rgp_to_spots.values() for spot in spots if type(spot) == NewSpot} - if new_spots: - logging.getLogger('PPanGGOLiN').info(f'{len(new_spots)} new spots have been created for the input genome.') - summarize_spots(new_spots, output_dir, compress = False, file_name="new_spots_summary.tsv") else: - logging.getLogger('PPanGGOLiN').info('No RGPs have been predicted in the input genomes. Spot prediction and RGP output are skipped.') + write_predicted_regions(input_org_rgps, output=output_dir, compress=False) + + if project_spots and len(input_org_rgps) > 0: + predict_spots_in_input_organism(initial_spots=pangenome.spots, + initial_regions=pangenome.regions, + input_org_rgps=input_org_rgps, + multigenics=multigenics, output=output_dir, + write_graph_flag=args.spot_graph, graph_formats=args.graph_formats, + overlapping_match=pangenome_params.spot.overlapping_match, + set_size=pangenome_params.spot.set_size, + exact_match=pangenome_params.spot.exact_match_size) + + + + project_and_write_modules(pangenome, input_organism, output_dir) @@ -493,6 +549,17 @@ def predict_spots_in_input_organism(initial_spots: List[Spot], initial_regions: write_spot_graph(graph_spot, output, graph_formats, file_basename='projected_spotGraph') + + write_rgp_to_spot_table(input_rgp_to_spots, output=output, filename='input_organism_rgp_to_spot.tsv') + + + new_spots = {spot for spots in input_rgp_to_spots.values() for spot in spots if type(spot) == NewSpot} + + if new_spots: + 
logging.getLogger('PPanGGOLiN').info(f'{len(new_spots)} new spots have been created for the input genome.') + summarize_spots(new_spots, output, compress = False, file_name="new_spots_summary.tsv") + + return input_rgp_to_spots @@ -506,8 +573,6 @@ def project_and_write_modules(pangenome:Pangenome, input_organism: Organism, out output_file = output / "modules_in_input_organism.tsv" - logging.getLogger().info("Writing modules to organisms associations...") - input_organism_families = input_organism.families counter = 0 with write_compressed_or_not(output_file, compress) as fout: @@ -525,7 +590,7 @@ def project_and_write_modules(pangenome:Pangenome, input_organism: Organism, out logging.getLogger().info(f"{counter} modules have been projected to the input genomes.") logging.getLogger().info( - f"Writing projected modules to input organism : '{output_file}'") + f"Projected modules have been written in: '{output_file}'") def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: @@ -551,20 +616,19 @@ def parser_projection(parser: argparse.ArgumentParser): description="One of the following arguments is required :") required.add_argument('-p', '--pangenome', required=False, type=Path, help="The pangenome.h5 file") - required.add_argument('--organism_name', required=False, type=str, + required.add_argument("-n", '--organism_name', required=False, type=str, help="Name of the input_organism whose genome is being annotated with the provided pangenome.") required.add_argument('--fasta_file', required=False, type=Path, - help="The filepath of the genomic sequence(s) in FASTA format for the projected genome. " + help="The filepath of the genomic sequence(s) in FASTA format if the genome to annotate. " "(Fasta file can be compressed with gzip)") required.add_argument('--annot_file', required=False, type=Path, - help="The filepath of the annotations in GFF/GBFF format for the projected genome. 
" + help="The filepath of the annotations in GFF/GBFF format for the genome to annotate with the provided pangenome. " "(Annotation file can be compressed with gzip)") - - optional = parser.add_argument_group(title="Optional arguments") + optional.add_argument('-o', '--output', required=False, type=Path, default="ppanggolin_output" + time.strftime("_DATE%Y-%m-%d_HOUR%H.%M.%S", time.localtime()) + "_PID" + str(os.getpid()), From 24c450dbb4b2eb56c6b3f436f7b36d6d66f2a274 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 9 Aug 2023 17:33:16 +0200 Subject: [PATCH 064/173] Add check on the required input param of projection --- ppanggolin/main.py | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/ppanggolin/main.py b/ppanggolin/main.py index e96d9bf5..5dfbacf0 100644 --- a/ppanggolin/main.py +++ b/ppanggolin/main.py @@ -127,19 +127,33 @@ def cmd_line() -> argparse.Namespace: set_verbosity_level(args) if args.subcommand == "annotate" and args.fasta is None and args.anno is None: - parser.error("You must provide at least a file with the --fasta option to annotate from sequences, " - "or a file with the --gff option to load annotations through the command line or the config file.") + parser.error("Please provide either a sequence file using the --fasta option or an annotation file using the --anno option " + "to enable annotation. 
Use the command line or the config file.") cmds_pangenome_required = ["cluster", "info", "module", "graph", "align", "context", "write", "msa", "draw", "partition", "rarefaction", "spot", "fasta", "metrics", "rgp", "projection", "metadata"] if args.subcommand in cmds_pangenome_required and args.pangenome is None: - parser.error("You must provide a pangenome file with the --pangenome " - "argument through the command line or the config file.") + parser.error("Please specify a pangenome file using the --pangenome argument, " + "either through the command line or the config file.") + if args.subcommand == "align" and args.sequences is None: - parser.error("You must provide sequences (nucleotides or amino acids) to align on the pangenome gene families " - "with the --sequences argument through the command line or the config file.") + parser.error("Please provide sequences (nucleotides or amino acids) for alignment with the pangenome gene families " + "using the --sequences argument, either through the command line or the config file.") + + + if args.subcommand == "projection" and args.organism_name is None: + parser.error("Please specify the name of the input organism you want to annotate using the provided pangenome. 
" + "You can use the --organism_name argument either through the command line or the config file.") + + if args.subcommand == "projection" and args.fasta_file is None and args.annot_file is None: + parser.error("Please provide either a sequence file using the --fasta_file option or an annotation file (GFF/GBFF) " + "using the --annot_file option for the input organism, either through the command line or the config file, " + "to enable annotation with the provided pangenome.") + + + return args From 1b37535df8cd7ce51abed28fe1149c1ad55d7014 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 9 Aug 2023 17:33:56 +0200 Subject: [PATCH 065/173] Use a better genome in projection cmd --- .github/workflows/main.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 6a2acc8b..a8652acd 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -72,7 +72,7 @@ jobs: ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families all --gene_families shell --regions all --fasta organisms.fasta.list ppanggolin draw -p stepbystep/pangenome.h5 --draw_spots --spots all -o stepbystep -f ppanggolin metrics -p stepbystep/pangenome.h5 --genome_fluidity --info_modules --no_print_info -f --log metrics.log - cd - + cd - - name: gbff parsing and MSA computing shell: bash -l {0} run: | @@ -123,5 +123,5 @@ jobs: run: | cd testingDataset ppanggolin projection --pangenome stepbystep/pangenome.h5 -o projection_from_gbff --organism_name trotro --annot GBFF/GCF_000026905.1_ASM2690v1_genomic.gbff.gz --spot_graph - ppanggolin projection --pangenome mybasicpangenome/pangenome.h5 -o projection_from_fasta --organism_name trotro --fasta FASTA/GCF_000026905.1_ASM2690v1_genomic.fna.gz --spot_graph --graph_formats graphml + ppanggolin projection --pangenome mybasicpangenome/pangenome.h5 -o projection_from_fasta --organism_name trotro --fasta_file 
FASTA/GCF_002776845.1_ASM277684v1_genomic.fna.gz --spot_graph --graph_formats graphml \ No newline at end of file From 2276f125dc8cfccf151a77db6b9b92cd90bf08f2 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 9 Aug 2023 17:38:41 +0200 Subject: [PATCH 066/173] apply autopep8 --- ppanggolin/projection/projection.py | 460 ++++++++++++++-------------- 1 file changed, 238 insertions(+), 222 deletions(-) diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index 89eb8ec5..51547ab3 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -36,7 +36,6 @@ from ppanggolin.formats.writeFlat import summarize_spots - class NewSpot(Spot): """ This class represent a hotspot specifically @@ -47,9 +46,130 @@ def __str__(self): return f'new_spot_{str(self.ID)}' +def launch(args: argparse.Namespace): + """ + Command launcher + + :param args: All arguments provide by user + """ + + output_dir = Path(args.output) + mk_outdir(output_dir, args.force) + + # For the moment this element of the pangenome are predicted by default + project_modules = True + predict_rgp = True + project_spots = True + + pangenome = Pangenome() + pangenome.add_file(args.pangenome) + + if pangenome.status["partitioned"] not in ["Computed", "Loaded", "inFile"]: + raise NameError(f"The provided pangenome has not been partitioned. " + "Annotation of an external genome is therefore not possible. " + "See the 'partition' subcommands.") + + if pangenome.status["predictedRGP"] not in ["Computed", "Loaded", "inFile"]: + logging.getLogger().info("RGPs have not been predicted in the provided pangenome. " + "Projection of RGPs into the provided genome will not be performed.") + predict_rgp = False + + if pangenome.status["spots"] not in ["Computed", "Loaded", "inFile"]: + logging.getLogger().info("Spots have not been predicted in the provided pangenome. 
" + "Projection of spots into the provided genome will not be performed.") + project_spots = False + + if pangenome.status["modules"] not in ["Computed", "Loaded", "inFile"]: + logging.getLogger().info("Modules have not been predicted in the provided pangenome. " + "Projection of modules into the provided genome will not be performed.") + + project_modules = False + + check_pangenome_info(pangenome, need_annotations=True, need_families=True, disable_bar=args.disable_prog_bar, + need_rgp=predict_rgp, need_modules=project_modules, + need_spots=project_spots) + + logging.getLogger().info('Retrieving parameters from the provided pangenome file.') + pangenome_params = argparse.Namespace( + **{step: argparse.Namespace(**k_v) for step, k_v in pangenome.parameters.items()}) + + if args.organism_name in [org.name for org in pangenome.organisms]: + raise NameError( + f"The provided organism name '{args.organism_name}' already exists in the given pangenome.") + + if args.annot_file is not None: + # read_annotations(pangenome, args.anno, cpu=args.cpu, pseudo=args.use_pseudo, disable_bar=args.disable_prog_bar) + input_organism, has_sequence = read_anno_file(organism_name=args.organism_name, + filename=args.annot_file, + circular_contigs=[], + pseudo=args.use_pseudo) + + if not has_sequence: + if args.fasta_file: + retrieve_gene_sequences_from_fasta_file( + input_organism, args.fasta_file) + else: + raise Exception("The gff/gbff provided did not have any sequence information, " + "Thus, we do not have the information we need to continue the projection.") + + elif args.fasta_file is not None: + annotate_param_names = ["norna", "kingdom", + "allow_overlap", "prodigal_procedure"] + + annotate_params = manage_annotate_param( + annotate_param_names, pangenome_params.annotate, args.config) + + input_organism = annotate_organism(org_name=args.organism_name, file_name=args.fasta_file, circular_contigs=[], tmpdir=args.tmpdir, + code=args.translation_table, norna=annotate_params.norna, 
kingdom=annotate_params.kingdom, + overlap=annotate_params.allow_overlap, procedure=annotate_params.prodigal_procedure) + + else: + raise Exception( + "At least one of --fasta_file or --anno_file must be given") + + # Add input organism in pangenome. This is temporary as the pangenome object is not going to be written. + pangenome.add_organism(input_organism) + + annotate_input_genes_with_pangenome_families(pangenome, input_organism=input_organism, output=output_dir, cpu=args.cpu, + no_defrag=args.no_defrag, identity=args.identity, coverage=args.coverage, tmpdir=args.tmpdir, + translation_table=args.translation_table) + + if predict_rgp: + logging.getLogger().info('Detecting rgp in input genome.') + + logging.getLogger().info("Detecting multigenic families...") + multigenics = pangenome.get_multigenics( + pangenome_params.rgp.dup_margin) + + input_org_rgps = predict_RGP(pangenome, input_organism, persistent_penalty=pangenome_params.rgp.persistent_penalty, variable_gain=pangenome_params.rgp.variable_gain, + min_length=pangenome_params.rgp.min_length, min_score=pangenome_params.rgp.min_score, multigenics=multigenics, + disable_bar=args.disable_prog_bar) + if len(input_org_rgps) == 0: + + logging.getLogger('PPanGGOLiN').info("No RGPs have been found in the input organisms. 
" + "As a result, spot prediction and RGP output will be skipped.") + + else: + + write_predicted_regions( + input_org_rgps, output=output_dir, compress=False) + + if project_spots and len(input_org_rgps) > 0: + predict_spots_in_input_organism(initial_spots=pangenome.spots, + initial_regions=pangenome.regions, + input_org_rgps=input_org_rgps, + multigenics=multigenics, output=output_dir, + write_graph_flag=args.spot_graph, graph_formats=args.graph_formats, + overlapping_match=pangenome_params.spot.overlapping_match, + set_size=pangenome_params.spot.set_size, + exact_match=pangenome_params.spot.exact_match_size) + + project_and_write_modules(pangenome, input_organism, output_dir) + + def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_organism: Organism, output: Path, cpu: int, no_defrag: bool, - identity: float, coverage: float, tmpdir: Path, - translation_table: int): + identity: float, coverage: float, tmpdir: Path, + translation_table: int): """ Annotate input genes with pangenome gene families and perform clustering. 
@@ -70,13 +190,14 @@ def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_org seq_fasta_file = output / f"{input_organism.name}.fasta" with open(seq_fasta_file, "w") as fh_out_faa: - write_gene_sequences_from_annotations(input_organism.genes, fh_out_faa, disable_bar=True) - + write_gene_sequences_from_annotations( + input_organism.genes, fh_out_faa, disable_bar=True) + with tempfile.TemporaryDirectory(dir=tmpdir, prefix="seq_to_pang_tmpdir_") as new_tmpdir: seq_set, _, seqid_to_gene_family = get_seq2pang(pangenome, seq_fasta_file, output, Path(new_tmpdir), cpu, no_defrag, identity=identity, coverage=coverage, is_nucleotide=True, translation_table=translation_table) - + project_and_write_partition(seqid_to_gene_family, seq_set, output) lonely_gene = 0 @@ -90,9 +211,10 @@ def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_org new_gene_family.add_partition("Cloud") lonely_gene += 1 - logging.getLogger().info(f"The input organism has {lonely_gene}/{input_organism.number_of_genes()} " + logging.getLogger().info(f"The input organism has {lonely_gene}/{input_organism.number_of_genes()} " "genes that do not cluster with any of the gene families in the pangenome.") + def predict_RGP(pangenome: Pangenome, input_organism: Organism, persistent_penalty: int, variable_gain: int, min_length: int, min_score: int, multigenics: float, disable_bar: bool) -> None: @@ -115,9 +237,10 @@ def predict_RGP(pangenome: Pangenome, input_organism: Organism, persistent_penal name_scheme = naming_scheme(pangenome) rgps = compute_org_rgp(input_organism, multigenics, persistent_penalty, variable_gain, min_length, - min_score, naming=name_scheme, disable_bar=disable_bar) + min_score, naming=name_scheme, disable_bar=disable_bar) - logging.getLogger().info(f"{len(rgps)} RGPs have been predicted in the input genomes.") + logging.getLogger().info( + f"{len(rgps)} RGPs have been predicted in the input genomes.") return rgps @@ -132,12 +255,14 @@ def 
write_predicted_regions(regions: Set[Region], """ fname = output / "plastic_regions.tsv" with write_compressed_or_not(fname, compress) as tab: - fieldnames = ["region", "organism", "contig", "start", "stop", "genes", "contigBorder", "wholeContig"] + fieldnames = ["region", "organism", "contig", "start", + "stop", "genes", "contigBorder", "wholeContig"] writer = csv.DictWriter(tab, fieldnames=fieldnames, delimiter='\t') writer.writeheader() - regions = sorted(regions, key=lambda x: (x.organism.name, x.contig.name, x.start)) + regions = sorted(regions, key=lambda x: ( + x.organism.name, x.contig.name, x.start)) for region in regions: row = { "region": region.name, @@ -163,15 +288,17 @@ def write_rgp_to_spot_table(rgp_to_spots: Dict[Region, Set[str]], output: Path, :param compress: Whether to compress the file. """ fname = output / filename - logging.getLogger('PPanGGOLiN').info(f'Writing RGPs to spot table in {fname}') - + logging.getLogger('PPanGGOLiN').info( + f'Writing RGPs to spot table in {fname}') + with write_compressed_or_not(fname, compress) as tab: fieldnames = ["region", "spot_id"] writer = csv.DictWriter(tab, fieldnames=fieldnames, delimiter='\t') writer.writeheader() - regions = sorted(rgp_to_spots.keys(), key=lambda x: (x.organism.name, x.contig.name, x.start)) + regions = sorted(rgp_to_spots.keys(), key=lambda x: ( + x.organism.name, x.contig.name, x.start)) for region in regions: row = { "region": region.name, @@ -192,24 +319,23 @@ def retrieve_gene_sequences_from_fasta_file(input_organism, fasta_file): with read_compressed_or_not(fasta_file) as currFastaFile: contig_id2deq, _ = read_fasta(input_organism, currFastaFile) - for contig in input_organism.contigs: try: for gene in contig.genes: - gene.add_dna(get_dna_sequence(contig_id2deq[contig.name], gene)) + gene.add_dna(get_dna_sequence( + contig_id2deq[contig.name], gene)) for rna in contig.RNAs: rna.add_dna(get_dna_sequence(contig_id2deq[contig.name], rna)) except KeyError: msg = f"Fasta file for 
input_organism {input_organism.name} did not have the contig {contig.name} " \ - f"that was read from the annotation file. " + f"that was read from the annotation file. " msg += f"The provided contigs in the fasta were : " \ - f"{', '.join([contig for contig in contig_id2deq.keys()])}." + f"{', '.join([contig for contig in contig_id2deq.keys()])}." raise KeyError(msg) - -def manage_annotate_param(annotate_param_names: List[str], pangenome_args: argparse.Namespace, +def manage_annotate_param(annotate_param_names: List[str], pangenome_args: argparse.Namespace, config_file: Optional[str]) -> argparse.Namespace: """ Manage annotate parameters by collecting them from different sources and merging them. @@ -227,7 +353,8 @@ def manage_annotate_param(annotate_param_names: List[str], pangenome_args: argpa config_annotate_args = argparse.Namespace() else: config = defaultdict(dict, parse_config_file(config_file)) - config_annotate_args = get_config_args('annotate', annotate_subparser, config, "annotate", annotate_param_names, strict_config_check=False) + config_annotate_args = get_config_args( + 'annotate', annotate_subparser, config, "annotate", annotate_param_names, strict_config_check=False) annotate_param_from_pangenome = {} annotate_param_from_config = {} @@ -238,165 +365,42 @@ def manage_annotate_param(annotate_param_names: List[str], pangenome_args: argpa # Collecting annotate parameters from different sources for annotate_arg in annotate_param_names: if hasattr(pangenome_args, annotate_arg): - param_val = getattr(pangenome_args, annotate_arg) + param_val = getattr(pangenome_args, annotate_arg) annotate_param_from_pangenome[annotate_arg] = param_val setattr(annotate_params, annotate_arg, param_val) elif hasattr(config_annotate_args, annotate_arg): - param_val = getattr(config_annotate_args, annotate_arg) + param_val = getattr(config_annotate_args, annotate_arg) annotate_param_from_config[annotate_arg] = param_val setattr(annotate_params, annotate_arg, param_val) else: 
- param_val = getattr(default_annotate_args, annotate_arg) + param_val = getattr(default_annotate_args, annotate_arg) annotate_param_from_default[annotate_arg] = param_val setattr(annotate_params, annotate_arg, param_val) # Log the sources of the annotate parameters if len(annotate_param_from_pangenome) > 0: - param_val_string = ' '.join([f'--{k} {v}' for k, v in annotate_param_from_pangenome.items()]) + param_val_string = ' '.join( + [f'--{k} {v}' for k, v in annotate_param_from_pangenome.items()]) logging.getLogger("PPanGGOLiN").debug(f"{len(annotate_param_from_pangenome)}/{len(annotate_param_names)} annotate parameters extracted from pangenome parameters " f"(the parameters used to build the input pangenome): {param_val_string}") if len(annotate_param_from_config) > 0: - param_val_string = ';'.join([f' {k} : {v}' for k, v in annotate_param_from_config.items()]) + param_val_string = ';'.join( + [f' {k} : {v}' for k, v in annotate_param_from_config.items()]) logging.getLogger("PPanGGOLiN").debug(f"{len(annotate_param_from_config)}/{len(annotate_param_names)} annotate parameters were not found in pangenome internal parameters." - f" They have been parsed from the annotate section in the config file: {param_val_string}") + f" They have been parsed from the annotate section in the config file: {param_val_string}") if len(annotate_param_from_default) > 0: - param_val_string = ';'.join([f' {k} : {v}' for k, v in annotate_param_from_default.items()]) + param_val_string = ';'.join( + [f' {k} : {v}' for k, v in annotate_param_from_default.items()]) logging.getLogger("PPanGGOLiN").debug(f"{len(annotate_param_from_default)}/{len(annotate_param_names)} annotate parameters were not found in the pangenome parameters " - f"nor in the config file. Default values have been used: {param_val_string}") + f"nor in the config file. 
Default values have been used: {param_val_string}") return annotate_params - - -def launch(args: argparse.Namespace): - """ - Command launcher - - :param args: All arguments provide by user - """ - - - output_dir = Path(args.output) - mk_outdir(output_dir, args.force) - - - # For the moment this element of the pangenome are predicted by default - project_modules = True - predict_rgp = True - project_spots = True - - pangenome = Pangenome() - pangenome.add_file(args.pangenome) - - if pangenome.status["partitioned"] not in ["Computed", "Loaded", "inFile"]: - raise NameError(f"The provided pangenome has not been partitioned. " - "Annotation of an external genome is therefore not possible. " - "See the 'partition' subcommands.") - - if pangenome.status["predictedRGP"] not in ["Computed", "Loaded", "inFile"]: - logging.getLogger().info("RGPs have not been predicted in the provided pangenome. " - "Projection of RGPs into the provided genome will not be performed.") - predict_rgp = False - - if pangenome.status["spots"] not in ["Computed", "Loaded", "inFile"]: - logging.getLogger().info("Spots have not been predicted in the provided pangenome. " - "Projection of spots into the provided genome will not be performed.") - project_spots = False - - - if pangenome.status["modules"] not in ["Computed", "Loaded", "inFile"]: - logging.getLogger().info("Modules have not been predicted in the provided pangenome. 
" - "Projection of modules into the provided genome will not be performed.") - - project_modules = False - - - check_pangenome_info(pangenome, need_annotations=True, need_families=True, disable_bar=args.disable_prog_bar, - need_rgp=predict_rgp, need_modules=project_modules, - need_spots=project_spots) - - logging.getLogger().info('Retrieving parameters from the provided pangenome file.') - pangenome_params = argparse.Namespace(**{step:argparse.Namespace(**k_v) for step, k_v in pangenome.parameters.items()}) - - - if args.organism_name in [org.name for org in pangenome.organisms]: - raise NameError(f"The provided organism name '{args.organism_name}' already exists in the given pangenome.") - - if args.annot_file is not None: - # read_annotations(pangenome, args.anno, cpu=args.cpu, pseudo=args.use_pseudo, disable_bar=args.disable_prog_bar) - input_organism, has_sequence = read_anno_file(organism_name = args.organism_name, - filename=args.annot_file, - circular_contigs=[], - pseudo=args.use_pseudo) - - if not has_sequence: - if args.fasta_file: - retrieve_gene_sequences_from_fasta_file(input_organism, args.fasta_file) - else: - raise Exception("The gff/gbff provided did not have any sequence information, " - "Thus, we do not have the information we need to continue the projection.") - - elif args.fasta_file is not None: - annotate_param_names = ["norna", "kingdom", "allow_overlap", "prodigal_procedure"] - - annotate_params = manage_annotate_param(annotate_param_names, pangenome_params.annotate, args.config) - - input_organism = annotate_organism(org_name=args.organism_name, file_name = args.fasta_file, circular_contigs=[], tmpdir=args.tmpdir, - code = args.translation_table, norna=annotate_params.norna, kingdom = annotate_params.kingdom, - overlap=annotate_params.allow_overlap, procedure=annotate_params.prodigal_procedure) - - else: - raise Exception("At least one of --fasta_file or --anno_file must be given") - - - # Add input organism in pangenome. 
This is temporary as the pangenome object is not going to be written. - pangenome.add_organism(input_organism) - - annotate_input_genes_with_pangenome_families(pangenome, input_organism=input_organism, output=output_dir, cpu=args.cpu, - no_defrag = args.no_defrag, identity = args.identity, coverage = args.coverage, tmpdir=args.tmpdir, - translation_table = args.translation_table) - - if predict_rgp: - logging.getLogger().info('Detecting rgp in input genome.') - - - logging.getLogger().info("Detecting multigenic families...") - multigenics = pangenome.get_multigenics(pangenome_params.rgp.dup_margin) - - input_org_rgps = predict_RGP(pangenome, input_organism, persistent_penalty=pangenome_params.rgp.persistent_penalty, variable_gain=pangenome_params.rgp.variable_gain, - min_length=pangenome_params.rgp.min_length, min_score=pangenome_params.rgp.min_score, multigenics=multigenics, - disable_bar=args.disable_prog_bar) - if len(input_org_rgps) == 0: - - logging.getLogger('PPanGGOLiN').info("No RGPs have been found in the input organisms. " - "As a result, spot prediction and RGP output will be skipped.") - - - else: - - write_predicted_regions(input_org_rgps, output=output_dir, compress=False) - - if project_spots and len(input_org_rgps) > 0: - predict_spots_in_input_organism(initial_spots=pangenome.spots, - initial_regions=pangenome.regions, - input_org_rgps=input_org_rgps, - multigenics=multigenics, output=output_dir, - write_graph_flag=args.spot_graph, graph_formats=args.graph_formats, - overlapping_match=pangenome_params.spot.overlapping_match, - set_size=pangenome_params.spot.set_size, - exact_match=pangenome_params.spot.exact_match_size) - - - - - project_and_write_modules(pangenome, input_organism, output_dir) - - def check_spots_congruency(graph_spot: nx.Graph, spots: List[Spot]) -> None: """ Check congruency of spots in the spot graph with the original spots. 
@@ -413,10 +417,11 @@ def check_spots_congruency(graph_spot: nx.Graph, spots: List[Spot]) -> None: regions_in_cc = set() for node in cc: regions_in_cc |= graph_spot.nodes[node]["rgp"] - + # check that region in cc are the regions of a spot spot_in_cc = {rgp_to_spot[rgp] for rgp in regions_in_cc} - assert len(spot_in_cc) == 1, "More than one spot in a connected_components. Something went wrong when recomputing spots." + assert len( + spot_in_cc) == 1, "More than one spot in a connected_components. Something went wrong when recomputing spots." current_spot = spot_in_cc.pop() # Add spot id to the graph for node in cc: @@ -424,10 +429,9 @@ def check_spots_congruency(graph_spot: nx.Graph, spots: List[Spot]) -> None: graph_spot.nodes[node]["spots"] = {current_spot} - -def predict_spots_in_input_organism(initial_spots: List[Spot], initial_regions: List[Region], - input_org_rgps: List[Region], multigenics: Set[GeneFamily], output: str, - write_graph_flag: bool = False, graph_formats: List[str] = ['gexf'], +def predict_spots_in_input_organism(initial_spots: List[Spot], initial_regions: List[Region], + input_org_rgps: List[Region], multigenics: Set[GeneFamily], output: str, + write_graph_flag: bool = False, graph_formats: List[str] = ['gexf'], overlapping_match: int = 2, set_size: int = 3, exact_match: int = 1) -> Dict: """ Create a spot graph from pangenome RGP and predict spots for input organism RGPs. 
@@ -448,15 +452,14 @@ def predict_spots_in_input_organism(initial_spots: List[Spot], initial_regions: logging.getLogger("PPanGGOLiN").info(f"Rebuilding spot graph.") graph_spot = make_spot_graph(rgps=initial_regions, multigenics=multigenics, - overlapping_match=overlapping_match, set_size=set_size, exact_match=exact_match) - + overlapping_match=overlapping_match, set_size=set_size, exact_match=exact_match) + original_nodes = set(graph_spot.nodes) # Check congruency with already computed spot and add spot id in node attributes check_spots_congruency(graph_spot, initial_spots) - - # Check which input RGP has a spot + # Check which input RGP has a spot lost = 0 used = 0 @@ -470,39 +473,40 @@ def predict_spots_in_input_organism(initial_spots: List[Spot], initial_regions: used += 1 border_node = add_new_node_in_spot_graph(graph_spot, rgp, border) input_org_node_to_rgps[border_node].add(rgp) - + if len(input_org_node_to_rgps) == 0: logging.getLogger("PPanGGOLiN").info(f"No RGPs of the input organism will be associated with any spot of insertion " - "as they are on a contig border (or have " - f"less than {set_size} persistent gene families until the contig border). " - "Projection of spots stops here") + "as they are on a contig border (or have " + f"less than {set_size} persistent gene families until the contig border). 
" + "Projection of spots stops here") return {} - # remove node that were already in the graph - new_nodes = set(input_org_node_to_rgps) - original_nodes - + # remove node that were already in the graph + new_nodes = set(input_org_node_to_rgps) - original_nodes + logging.getLogger("PPanGGOLiN").info(f"{lost} RGPs of the input organism won't be associated with any spot of insertion " "as they are on a contig border (or have " f"less than {set_size} persistent gene families until the contig border)") - - logging.getLogger("PPanGGOLiN").info(f"{used} RGPs of the input organism will be associated to a spot of insertion") + + logging.getLogger("PPanGGOLiN").info( + f"{used} RGPs of the input organism will be associated to a spot of insertion") # add potential edges from new nodes to the rest of the nodes - all_nodes = list(graph_spot.nodes) + all_nodes = list(graph_spot.nodes) for nodei in new_nodes: for nodej in all_nodes: if nodei == nodej: continue node_obj_i = graph_spot.nodes[nodei] node_obj_j = graph_spot.nodes[nodej] - if check_sim([node_obj_i["border0"], node_obj_i["border1"]], + if check_sim([node_obj_i["border0"], node_obj_i["border1"]], [node_obj_j["border0"], node_obj_j["border1"]], overlapping_match, set_size, exact_match): graph_spot.add_edge(nodei, nodej) - + input_rgp_to_spots = {} new_spots = [] - new_spot_id_counter = max((s.ID for s in initial_spots)) + 1 + new_spot_id_counter = max((s.ID for s in initial_spots)) + 1 # determine spot ids of the new nodes and by extension to their rgps for comp in nx.algorithms.components.connected_components(graph_spot): # in very rare case one cc can have several original spots @@ -511,7 +515,8 @@ def predict_spots_in_input_organism(initial_spots: List[Spot], initial_regions: spots_of_the_cc = set() for node in comp: if "spots" in graph_spot.nodes[node]: - spots_of_the_cc |= {spot for spot in graph_spot.nodes[node]["spots"]} + spots_of_the_cc |= { + spot for spot in graph_spot.nodes[node]["spots"]} if 
len(spots_of_the_cc) == 0: # no spot associated with any node of the cc @@ -519,51 +524,55 @@ def predict_spots_in_input_organism(initial_spots: List[Spot], initial_regions: # let's add a new spot id new_spot = NewSpot(new_spot_id_counter) new_spots.append(new_spot) - spots_of_the_cc = {new_spot} # {f"new_spot_{new_spot_id_counter}"} + spots_of_the_cc = {new_spot} # {f"new_spot_{new_spot_id_counter}"} new_spot_id_counter += 1 elif len(spots_of_the_cc) > 1: - # more than one spot in the cc + # more than one spot in the cc logging.getLogger("PPanGGOLiN").info('Some RGPs of the input organism ' f"are connected to {len(spots_of_the_cc)} original spots of the pangenome.") - + input_rgps_of_the_cc = set() for node in comp: if node in input_org_node_to_rgps: input_rgps_of_the_cc |= input_org_node_to_rgps[node] - + if write_graph_flag: graph_spot.nodes[node]["spots"] = spots_of_the_cc - graph_spot.nodes[node]["spot_id"] = ';'.join((str(spot) for spot in spots_of_the_cc)) + graph_spot.nodes[node]["spot_id"] = ';'.join( + (str(spot) for spot in spots_of_the_cc)) graph_spot.nodes[node]["includes_RGPs_from_the_input_organism"] = True for spot in spots_of_the_cc: spot.add_regions(input_rgps_of_the_cc) - input_rgp_to_spots.update({rgp:spots_of_the_cc for rgp in input_rgps_of_the_cc}) - + input_rgp_to_spots.update( + {rgp: spots_of_the_cc for rgp in input_rgps_of_the_cc}) + if write_graph_flag: for node in graph_spot.nodes: del graph_spot.nodes[node]["spots"] - write_spot_graph(graph_spot, output, graph_formats, file_basename='projected_spotGraph') - - - write_rgp_to_spot_table(input_rgp_to_spots, output=output, filename='input_organism_rgp_to_spot.tsv') + write_spot_graph(graph_spot, output, graph_formats, + file_basename='projected_spotGraph') + write_rgp_to_spot_table(input_rgp_to_spots, output=output, + filename='input_organism_rgp_to_spot.tsv') - new_spots = {spot for spots in input_rgp_to_spots.values() for spot in spots if type(spot) == NewSpot} + new_spots = {spot for 
spots in input_rgp_to_spots.values() + for spot in spots if type(spot) == NewSpot} if new_spots: - logging.getLogger('PPanGGOLiN').info(f'{len(new_spots)} new spots have been created for the input genome.') - summarize_spots(new_spots, output, compress = False, file_name="new_spots_summary.tsv") - + logging.getLogger('PPanGGOLiN').info( + f'{len(new_spots)} new spots have been created for the input genome.') + summarize_spots(new_spots, output, compress=False, + file_name="new_spots_summary.tsv") return input_rgp_to_spots -def project_and_write_modules(pangenome:Pangenome, input_organism: Organism, output:Path, compress:bool=False): +def project_and_write_modules(pangenome: Pangenome, input_organism: Organism, output: Path, compress: bool = False): """ Write a tsv file providing association between modules and the input organism @@ -579,15 +588,19 @@ def project_and_write_modules(pangenome:Pangenome, input_organism: Organism, out fout.write("module_id\torganism\tcompletion\n") for mod in pangenome.modules: - module_in_input_organism = any((fam in input_organism_families for fam in mod.families)) + module_in_input_organism = any( + (fam in input_organism_families for fam in mod.families)) if module_in_input_organism: counter += 1 - completion = round(len(input_organism.families & mod.families) / len(mod.families), 2) - fout.write(f"module_{mod.ID}\t{input_organism.name}\t{completion}\n") + completion = round( + len(input_organism.families & mod.families) / len(mod.families), 2) + fout.write( + f"module_{mod.ID}\t{input_organism.name}\t{completion}\n") - logging.getLogger().info(f"{counter} modules have been projected to the input genomes.") + logging.getLogger().info( + f"{counter} modules have been projected to the input genomes.") logging.getLogger().info( f"Projected modules have been written in: '{output_file}'") @@ -601,7 +614,8 @@ def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser :return : parser arguments for align command """ - 
parser = sub_parser.add_parser("projection", formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser = sub_parser.add_parser( + "projection", formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser_projection(parser) return parser @@ -614,18 +628,19 @@ def parser_projection(parser: argparse.ArgumentParser): """ required = parser.add_argument_group(title="Required arguments", description="One of the following arguments is required :") - required.add_argument('-p', '--pangenome', required=False, type=Path, help="The pangenome.h5 file") - + required.add_argument('-p', '--pangenome', required=False, + type=Path, help="The pangenome.h5 file") + required.add_argument("-n", '--organism_name', required=False, type=str, - help="Name of the input_organism whose genome is being annotated with the provided pangenome.") - + help="Name of the input_organism whose genome is being annotated with the provided pangenome.") + required.add_argument('--fasta_file', required=False, type=Path, - help="The filepath of the genomic sequence(s) in FASTA format if the genome to annotate. " - "(Fasta file can be compressed with gzip)") + help="The filepath of the genomic sequence(s) in FASTA format if the genome to annotate. " + "(Fasta file can be compressed with gzip)") required.add_argument('--annot_file', required=False, type=Path, - help="The filepath of the annotations in GFF/GBFF format for the genome to annotate with the provided pangenome. " - "(Annotation file can be compressed with gzip)") + help="The filepath of the annotations in GFF/GBFF format for the genome to annotate with the provided pangenome. 
" + "(Annotation file can be compressed with gzip)") optional = parser.add_argument_group(title="Optional arguments") @@ -633,32 +648,33 @@ def parser_projection(parser: argparse.ArgumentParser): default="ppanggolin_output" + time.strftime("_DATE%Y-%m-%d_HOUR%H.%M.%S", time.localtime()) + "_PID" + str(os.getpid()), help="Output directory") - + optional.add_argument("--tmpdir", required=False, type=Path, default=Path(tempfile.gettempdir()), - help="directory for storing temporary files") - + help="directory for storing temporary files") + optional.add_argument('--no_defrag', required=False, action="store_true", help="DO NOT Realign gene families to link fragments with" "their non-fragmented gene family. (default: False)") - + optional.add_argument('--identity', required=False, type=restricted_float, default=0.5, help="min identity percentage threshold") - + optional.add_argument('--coverage', required=False, type=restricted_float, default=0.8, help="min coverage percentage threshold") - + optional.add_argument("--translation_table", required=False, default="11", help="Translation table (genetic code) to use.") - + optional.add_argument("--use_pseudo", required=False, action="store_true", help="In the context of provided annotation, use this option to read pseudogenes. " "(Default behavior is to ignore them)") - + optional.add_argument("--spot_graph", required=False, action="store_true", - help="Write the spot graph to a file, with pairs of blocks of single copy markers flanking RGPs " - "as nodes. This graph can be used to visualize nodes that have RGPs from the input organism.") - + help="Write the spot graph to a file, with pairs of blocks of single copy markers flanking RGPs " + "as nodes. 
This graph can be used to visualize nodes that have RGPs from the input organism.") + optional.add_argument('--graph_formats', required=False, type=str, choices=['gexf', "graphml"], nargs="+", - default=['gexf'], help="Format of the output graph.") - - optional.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus") + default=['gexf'], help="Format of the output graph.") + + optional.add_argument("-c", "--cpu", required=False, + default=1, type=int, help="Number of available cpus") From 01619b36cbc238f011678888878b8545b6ed774c Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Thu, 10 Aug 2023 17:21:48 +0200 Subject: [PATCH 067/173] add contig length in contig object --- ppanggolin/annotate/annotate.py | 8 ++++++++ ppanggolin/annotate/synta.py | 2 ++ ppanggolin/genome.py | 15 +++++++++++++++ 3 files changed, 25 insertions(+) diff --git a/ppanggolin/annotate/annotate.py b/ppanggolin/annotate/annotate.py index 5491a9d4..f7bc33d6 100644 --- a/ppanggolin/annotate/annotate.py +++ b/ppanggolin/annotate/annotate.py @@ -232,9 +232,13 @@ def read_org_gbff(organism: str, gbff_file_path: str, circular_contigs: list, ps while not line.startswith('//'): sequence += line[10:].replace(" ", "").strip().upper() line = lines.pop() + + contig.add_contig_length(len(sequence)) + # get each gene's sequence. 
for gene in contig.genes: gene.add_dna(get_dna_sequence(sequence, gene)) + return org, True @@ -364,7 +368,11 @@ def get_id_attribute(attributes_dict: dict) -> str: # GET THE FASTA SEQUENCES OF THE GENES if has_fasta and fasta_string != "": contig_sequences, _ = read_fasta(org, fasta_string.split('\n')) # _ is total contig length + for contig in org.contigs: + + contig.add_contig_length(len(contig_sequences[contig.name])) + for gene in contig.genes: gene.add_dna(get_dna_sequence(contig_sequences[contig.name], gene)) for rna in contig.RNAs: diff --git a/ppanggolin/annotate/synta.py b/ppanggolin/annotate/synta.py index 23d5c195..1363c267 100644 --- a/ppanggolin/annotate/synta.py +++ b/ppanggolin/annotate/synta.py @@ -328,6 +328,8 @@ def annotate_organism(org_name: str, file_name: str, circular_contigs, tmpdir: s for contigName, genes in genes.items(): contig = org.get_contig(contigName) + contig.add_contig_length(len(contig_sequences[contig.name])) + if contig.name in circular_contigs: contig.is_circular = True for gene in genes: diff --git a/ppanggolin/genome.py b/ppanggolin/genome.py index 89424621..00400995 100644 --- a/ppanggolin/genome.py +++ b/ppanggolin/genome.py @@ -155,6 +155,7 @@ def __init__(self, name: str, is_circular: bool = False): self.RNAs = set() # saving the rna annotations. We're not using them in the vast majority of cases. self._genes_start = {} self._genes_position = [] + self.length = None @property def genes(self) -> list: @@ -204,6 +205,19 @@ def add_gene(self, gene: Gene): self._genes_position[gene.position] = gene self._genes_start[gene.start] = gene + def add_contig_length(self, contig_length: int): + """ + Add contig length to Contig object. + + :param contig_length: Length of the contig. + :raises ValueError: If trying to define a contig length different than previously defined. 
+ """ + if self.length is None: + self.length = contig_length + + elif self.length != contig_length: + raise ValueError('Attempting to define a contig length different from the previously defined value.') + class Organism(MetaFeatures): """ @@ -294,3 +308,4 @@ def mk_bitarray(self, index: Dict[Organism, int], partition: str = 'all'): self.bitarray[index[fam]] = 1 else: raise Exception("There is not any partition corresponding please report a github issue") + From b83dec835884ebb4ac30bb6d506f4387ede312b6 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Fri, 11 Aug 2023 10:50:32 +0200 Subject: [PATCH 068/173] improve summary --- ppanggolin/projection/projection.py | 98 ++++++++++++++++++++++++----- 1 file changed, 84 insertions(+), 14 deletions(-) diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index 51547ab3..7db6e86b 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -32,7 +32,7 @@ from ppanggolin.RGP.spot import make_spot_graph, check_sim, add_new_node_in_spot_graph, write_spot_graph from ppanggolin.genome import Organism, Gene, RNA, Contig from ppanggolin.geneFamily import GeneFamily -from ppanggolin.region import Region, Spot +from ppanggolin.region import Region, Spot, Module from ppanggolin.formats.writeFlat import summarize_spots @@ -71,10 +71,11 @@ def launch(args: argparse.Namespace): if pangenome.status["predictedRGP"] not in ["Computed", "Loaded", "inFile"]: logging.getLogger().info("RGPs have not been predicted in the provided pangenome. " - "Projection of RGPs into the provided genome will not be performed.") + "Projection of RGPs and spots into the provided genome will not be performed.") predict_rgp = False + project_spots = False - if pangenome.status["spots"] not in ["Computed", "Loaded", "inFile"]: + elif pangenome.status["spots"] not in ["Computed", "Loaded", "inFile"]: logging.getLogger().info("Spots have not been predicted in the provided pangenome. 
" "Projection of spots into the provided genome will not be performed.") project_spots = False @@ -130,10 +131,12 @@ def launch(args: argparse.Namespace): # Add input organism in pangenome. This is temporary as the pangenome object is not going to be written. pangenome.add_organism(input_organism) - annotate_input_genes_with_pangenome_families(pangenome, input_organism=input_organism, output=output_dir, cpu=args.cpu, + singleton_gene_count = annotate_input_genes_with_pangenome_families(pangenome, input_organism=input_organism, output=output_dir, cpu=args.cpu, no_defrag=args.no_defrag, identity=args.identity, coverage=args.coverage, tmpdir=args.tmpdir, translation_table=args.translation_table) + input_org_rgps, input_org_spots, input_org_modules = None, None, None + if predict_rgp: logging.getLogger().info('Detecting rgp in input genome.') @@ -155,7 +158,7 @@ def launch(args: argparse.Namespace): input_org_rgps, output=output_dir, compress=False) if project_spots and len(input_org_rgps) > 0: - predict_spots_in_input_organism(initial_spots=pangenome.spots, + input_org_spots = predict_spots_in_input_organism(initial_spots=pangenome.spots, initial_regions=pangenome.regions, input_org_rgps=input_org_rgps, multigenics=multigenics, output=output_dir, @@ -164,8 +167,67 @@ def launch(args: argparse.Namespace): set_size=pangenome_params.spot.set_size, exact_match=pangenome_params.spot.exact_match_size) - project_and_write_modules(pangenome, input_organism, output_dir) + if project_modules: + input_org_modules = project_and_write_modules(pangenome, input_organism, output_dir) + + summarize_projection(input_organism, pangenome, input_org_rgps, input_org_spots, input_org_modules, singleton_gene_count ) + + + +def summarize_projection(input_organism:Organism, pangenome:Pangenome, input_org_rgps:Region, + input_org_spots:Spot, input_org_modules:Module, singleton_gene_count:int ): + """ + + :param singleton_gene_count: Number of genes that do not cluster with any of the gene 
families of the pangenome. + """ + + partition_to_gene = defaultdict(set) + contigs_count = 0 + for contig in input_organism.contigs: + contigs_count += 1 + for gene in contig.genes: + partition_to_gene[gene.family.named_partition].add(gene) + + persistent_gene_count = len(partition_to_gene['persistent']) + shell_gene_count = len(partition_to_gene['shell']) + cloud_gene_count = len(partition_to_gene['cloud']) + + gene_count = persistent_gene_count + shell_gene_count + cloud_gene_count + + persistent_family_count = len({g.family for g in partition_to_gene['persistent']}) + shell_family_count = len({g.family for g in partition_to_gene['shell']}) + cloud_family_count = len({g.family for g in partition_to_gene['cloud']}) + + families_count = persistent_family_count + shell_family_count + cloud_family_count + + rgp_count = "Not computed" if input_org_rgps is None else len(input_org_rgps) + spot_count = "Not computed" if input_org_spots is None else len(input_org_spots) + new_spot_count = "Not computed" if input_org_spots is None else sum(1 for spot in input_org_spots if isinstance(spot, NewSpot)) + module_count = "Not computed" if input_org_modules is None else len(input_org_modules) + + summary_info = [ + ("Organism name", input_organism.name), + ("Pangenome file", pangenome.file), + ("Contigs", contigs_count), + ("Genes", gene_count), + ("Families", families_count), + ("Singleton families", singleton_gene_count), + ("Persistent families", persistent_family_count), + ("Persistent genes", persistent_gene_count), + ("Shell families", shell_family_count), + ("Shell genes", shell_gene_count), + ("Cloud families", cloud_family_count), + ("Cloud genes", cloud_gene_count), + ("RGPs", rgp_count), + ("Spots", spot_count), + ("New spots", new_spot_count), + ("Modules", module_count) + ] + + summary_str = '\n'.join((f' - {k}: {v}' for k,v in summary_info )) + print('Projection_summary:') + print(summary_str) def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, 
input_organism: Organism, output: Path, cpu: int, no_defrag: bool, identity: float, coverage: float, tmpdir: Path, @@ -184,7 +246,7 @@ def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_org :param disable_bar: Whether to disable progress bar. :param translation_table: Translation table ID for nucleotide sequences. - :return: None + :return: Number of genes that do not cluster with any of the gene families of the pangenome. """ seq_fasta_file = output / f"{input_organism.name}.fasta" @@ -213,6 +275,7 @@ def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_org logging.getLogger().info(f"The input organism has {lonely_gene}/{input_organism.number_of_genes()} " "genes that do not cluster with any of the gene families in the pangenome.") + return lonely_gene def predict_RGP(pangenome: Pangenome, input_organism: Organism, persistent_penalty: int, variable_gain: int, @@ -560,19 +623,23 @@ def predict_spots_in_input_organism(initial_spots: List[Spot], initial_regions: write_rgp_to_spot_table(input_rgp_to_spots, output=output, filename='input_organism_rgp_to_spot.tsv') - new_spots = {spot for spots in input_rgp_to_spots.values() - for spot in spots if type(spot) == NewSpot} + input_org_spots = {spot for spots in input_rgp_to_spots.values() + for spot in spots } + new_spots = {spot for spot in input_org_spots if type(spot) == NewSpot} + + + logging.getLogger('PPanGGOLiN').info( + f'{len(new_spots)} new spots have been created for the input genome.') if new_spots: - logging.getLogger('PPanGGOLiN').info( - f'{len(new_spots)} new spots have been created for the input genome.') summarize_spots(new_spots, output, compress=False, file_name="new_spots_summary.tsv") - return input_rgp_to_spots + return input_org_spots -def project_and_write_modules(pangenome: Pangenome, input_organism: Organism, output: Path, compress: bool = False): +def project_and_write_modules(pangenome: Pangenome, input_organism: Organism, + output: Path, 
compress: bool = False): """ Write a tsv file providing association between modules and the input organism @@ -584,6 +651,7 @@ def project_and_write_modules(pangenome: Pangenome, input_organism: Organism, ou input_organism_families = input_organism.families counter = 0 + modules_in_input_org = [] with write_compressed_or_not(output_file, compress) as fout: fout.write("module_id\torganism\tcompletion\n") @@ -593,6 +661,7 @@ def project_and_write_modules(pangenome: Pangenome, input_organism: Organism, ou if module_in_input_organism: counter += 1 + modules_in_input_org.append(mod) completion = round( len(input_organism.families & mod.families) / len(mod.families), 2) @@ -604,7 +673,8 @@ def project_and_write_modules(pangenome: Pangenome, input_organism: Organism, ou logging.getLogger().info( f"Projected modules have been written in: '{output_file}'") - + + return modules_in_input_org def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: """ From 4e969c7c850b15e56b46f12f67f4adf5f6da48c3 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Fri, 11 Aug 2023 10:50:56 +0200 Subject: [PATCH 069/173] fix tmpdir issues --- ppanggolin/align/alignOnPang.py | 23 +++++++++++------------ ppanggolin/projection/projection.py | 4 ++++ 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/ppanggolin/align/alignOnPang.py b/ppanggolin/align/alignOnPang.py index 19a9f9e8..8925fee6 100644 --- a/ppanggolin/align/alignOnPang.py +++ b/ppanggolin/align/alignOnPang.py @@ -63,28 +63,27 @@ def align_seq_to_pang(pang_file: IO, seq_file: TextIOWrapper, output: Path, cov_mode = "0" # coverage of query and target with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir.as_posix(), prefix="aln_result_db_file") as aln_db: - cmd = ["mmseqs", "search", seq_db.name, pang_db.name, aln_db.name, tmpdir.name, "-a", "--min-seq-id", str(identity), + cmd = ["mmseqs", "search", seq_db.name, pang_db.name, aln_db.name, tmpdir.as_posix(), "-a", "--min-seq-id", str(identity), "-c", 
str(coverage), "--cov-mode", cov_mode, "--threads", str(cpu)] if is_nucleotid: logging.getLogger().debug(f"Input sequences will be translated by mmseqs with translation table {translation_table}") cmd += ["--translation-table", f"{translation_table}", "--translate", "0" ] - - logging.getLogger().info("Aligning sequences to cluster representatives...") - logging.getLogger().debug(" ".join(cmd)) - subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) - - with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir.name, prefix="aln_result_db_file", suffix = ".tsv", delete=False) as outfile: - cmd = ["mmseqs", "convertalis", seq_db.name, pang_db.name, aln_db.name, outfile.name, "--format-mode", "2"] - - logging.getLogger().info("Extracting alignments...") + + logging.getLogger().info("Aligning sequences to cluster representatives...") logging.getLogger().debug(" ".join(cmd)) subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) + with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir, prefix="aln_result_db_file", suffix = ".tsv", delete=False) as outfile: + cmd = ["mmseqs", "convertalis", seq_db.name, pang_db.name, aln_db.name, outfile.name, "--format-mode", "2"] + + logging.getLogger().info("Extracting alignments...") + logging.getLogger().debug(" ".join(cmd)) + subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) + pang_db.close() seq_db.close() - aln_db.close() return outfile.name @@ -310,8 +309,8 @@ def get_seq2pang(pangenome: Pangenome, sequence_file: Path, output: Path, tmpdir :return: sequence set, blast-tab result file string, and sequences aligned with families """ - with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir.as_posix(), delete=False, suffix=".faa") as tmp_pang_file: + with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir.as_posix(), delete=False, suffix=".faa") as tmp_pang_file: write_gene_fam_sequences(pangenome, tmp_pang_file, add="ppanggolin_") with read_compressed_or_not(sequence_file) as seqFileObj: diff --git 
a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index 7db6e86b..23faf065 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -254,8 +254,12 @@ def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_org with open(seq_fasta_file, "w") as fh_out_faa: write_gene_sequences_from_annotations( input_organism.genes, fh_out_faa, disable_bar=True) + + # create tmpdir + mk_outdir(tmpdir, force=True) with tempfile.TemporaryDirectory(dir=tmpdir, prefix="seq_to_pang_tmpdir_") as new_tmpdir: + seq_set, _, seqid_to_gene_family = get_seq2pang(pangenome, seq_fasta_file, output, Path(new_tmpdir), cpu, no_defrag, identity=identity, coverage=coverage, is_nucleotide=True, translation_table=translation_table) From b0be441914996902724dde06e5d0e6a738030779 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Fri, 11 Aug 2023 11:00:38 +0200 Subject: [PATCH 070/173] improve docstring and improve spot count summary --- ppanggolin/RGP/genomicIsland.py | 3 ++- ppanggolin/projection/projection.py | 25 +++++++++++++++---------- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/ppanggolin/RGP/genomicIsland.py b/ppanggolin/RGP/genomicIsland.py index 30ff9480..2bba4591 100644 --- a/ppanggolin/RGP/genomicIsland.py +++ b/ppanggolin/RGP/genomicIsland.py @@ -204,7 +204,8 @@ def compute_org_rgp( organism: Organism, multigenics: set, :param min_score: Minimum score threshold for considering a region as RGP (default: 4). :param naming: Naming scheme for the regions, either "contig" or "organism" (default: "contig"). :param disable_bar: Whether to disable the progress bar. It is recommended to disable it when calling this function in a loop on multiple organisms (default: True). - :return: A set of organization regions representing the predicted RGPs. + + :return: A set of RGPs of the provided organism. 
""" org_regions = set() for contig in tqdm(organism.contigs, total=len(organism.contigs), unit="contig", disable=disable_bar): diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index 23faf065..7b1bf280 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -157,15 +157,20 @@ def launch(args: argparse.Namespace): write_predicted_regions( input_org_rgps, output=output_dir, compress=False) - if project_spots and len(input_org_rgps) > 0: + if predict_rgp and project_spots: + if len(input_org_rgps) == 0: + # if rgp and spot flag are on but no RGP has been found + # then no spot will be found + input_org_spots = {} + else: input_org_spots = predict_spots_in_input_organism(initial_spots=pangenome.spots, - initial_regions=pangenome.regions, - input_org_rgps=input_org_rgps, - multigenics=multigenics, output=output_dir, - write_graph_flag=args.spot_graph, graph_formats=args.graph_formats, - overlapping_match=pangenome_params.spot.overlapping_match, - set_size=pangenome_params.spot.set_size, - exact_match=pangenome_params.spot.exact_match_size) + initial_regions=pangenome.regions, + input_org_rgps=input_org_rgps, + multigenics=multigenics, output=output_dir, + write_graph_flag=args.spot_graph, graph_formats=args.graph_formats, + overlapping_match=pangenome_params.spot.overlapping_match, + set_size=pangenome_params.spot.set_size, + exact_match=pangenome_params.spot.exact_match_size) if project_modules: input_org_modules = project_and_write_modules(pangenome, input_organism, output_dir) @@ -255,7 +260,7 @@ def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_org write_gene_sequences_from_annotations( input_organism.genes, fh_out_faa, disable_bar=True) - # create tmpdir + # create tmpdir in case it does not exists mk_outdir(tmpdir, force=True) with tempfile.TemporaryDirectory(dir=tmpdir, prefix="seq_to_pang_tmpdir_") as new_tmpdir: @@ -297,7 +302,7 @@ def predict_RGP(pangenome: 
Pangenome, input_organism: Organism, persistent_penal :param multigenics: multigenic families. :param disable_bar: Flag to disable the progress bar. - :return: None + :return: Set of RGPs """ logging.getLogger().info("Computing Regions of Genomic Plasticity...") From 8dfb1acee50bf73752afe969572cd91896357b6e Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Fri, 11 Aug 2023 12:02:22 +0200 Subject: [PATCH 071/173] write and read contig length --- ppanggolin/formats/readBinaries.py | 9 +++++++-- ppanggolin/formats/writeBinaries.py | 10 ++++++---- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py index 1fc9a5ac..db425919 100644 --- a/ppanggolin/formats/readBinaries.py +++ b/ppanggolin/formats/readBinaries.py @@ -225,7 +225,7 @@ def get_gene_sequences_from_file(filename: str, file_obj: TextIO, list_cds: iter h5f.close() -def read_organism(pangenome: Pangenome, org_name: str, contig_dict: dict, circular_contigs: dict, genedata_dict: dict, +def read_organism(pangenome: Pangenome, org_name: str, contig_dict: dict, circular_contigs: dict, contig_to_length, genedata_dict: dict, link: bool = False): """ Read information from pangenome to assign to organism object @@ -240,6 +240,8 @@ def read_organism(pangenome: Pangenome, org_name: str, contig_dict: dict, circul gene, gene_type = (None, None) for contigName, geneList in contig_dict.items(): contig = org.get_contig(contigName, is_circular=circular_contigs[contigName]) + contig.add_contig_length(contig_to_length[contigName]) + for row in geneList: if link: # if the gene families are already computed/loaded the gene exists. 
gene = pangenome.get_gene(row["ID"].decode()) @@ -436,6 +438,7 @@ def read_annotation(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = table = annotations.genes pangenome_dict = {} circular_contigs = {} + contig_lengths = {} genedata_dict = read_genedata(h5f) @@ -449,16 +452,18 @@ def read_annotation(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = # new contig, seen org pangenome_dict[decode_org][row["contig"]["name"].decode()] = [row["gene"]] circular_contigs[decode_org][row["contig"]["name"].decode()] = row["contig"]["is_circular"] + contig_lengths[decode_org][row["contig"]["name"].decode()] = row["contig"]["length"] except KeyError: # new org pangenome_dict[sys.intern(decode_org)] = {row["contig"]["name"].decode(): [row["gene"]]} circular_contigs[decode_org] = {row["contig"]["name"].decode(): row["contig"]["is_circular"]} + contig_lengths[decode_org] = {row["contig"]["name"].decode(): row["contig"]["length"]} link = True if pangenome.status["genesClustered"] in ["Computed", "Loaded"] else False for orgName, contigDict in tqdm(pangenome_dict.items(), total=len(pangenome_dict), unit="organism", disable=disable_bar): - read_organism(pangenome, orgName, contigDict, circular_contigs[orgName], genedata_dict, link) + read_organism(pangenome, orgName, contigDict, circular_contigs[orgName], contig_lengths[orgName], genedata_dict, link) pangenome.status["genomesAnnotated"] = "Loaded" diff --git a/ppanggolin/formats/writeBinaries.py b/ppanggolin/formats/writeBinaries.py index b01a3a35..a2a64852 100644 --- a/ppanggolin/formats/writeBinaries.py +++ b/ppanggolin/formats/writeBinaries.py @@ -20,12 +20,12 @@ from ppanggolin.formats.readBinaries import read_genedata, Genedata -def gene_desc(org_len, contig_len, id_len, max_local_id) -> dict: +def gene_desc(org_len, contig_name_len, id_len, max_local_id) -> dict: """ Create a table to save gene-related information :param org_len: Maximum size of organism - :param contig_len: Maximum size of contigs + :param 
contig_name_len: Maximum size of contigs name :param id_len: Maximum size of gene ID :param max_local_id: Maximum size of gene local identifier @@ -35,8 +35,9 @@ def gene_desc(org_len, contig_len, id_len, max_local_id) -> dict: return { 'organism': tables.StringCol(itemsize=org_len), "contig": { - 'name': tables.StringCol(itemsize=contig_len), - "is_circular": tables.BoolCol(dflt=False) + 'name': tables.StringCol(itemsize=contig_name_len), + "is_circular": tables.BoolCol(dflt=False), + "length": tables.UInt32Col(), }, "gene": { 'ID': tables.StringCol(itemsize=id_len), @@ -169,6 +170,7 @@ def write_annotations(pangenome: Pangenome, h5f: tables.File, disable_bar: bool gene_row["organism"] = org.name gene_row["contig/name"] = contig.name gene_row["contig/is_circular"] = contig.is_circular + gene_row["contig/length"] = contig.length gene_row["gene/ID"] = gene.ID gene_row["gene/is_fragment"] = gene.is_fragment if gene.type == "CDS": From 6d1b6fcc2ce42e2d282ef673b9301f07f0a37b53 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Fri, 11 Aug 2023 17:00:04 +0200 Subject: [PATCH 072/173] add gff output --- ppanggolin/formats/writeFlat.py | 131 ++++++++++++++++++++++++++++++-- 1 file changed, 124 insertions(+), 7 deletions(-) diff --git a/ppanggolin/formats/writeFlat.py b/ppanggolin/formats/writeFlat.py index aa0078bc..7263fbff 100644 --- a/ppanggolin/formats/writeFlat.py +++ b/ppanggolin/formats/writeFlat.py @@ -15,7 +15,8 @@ # local libraries from ppanggolin.edge import Edge from ppanggolin.geneFamily import GeneFamily -from ppanggolin.genome import Organism +from ppanggolin.genome import Organism, Gene, Contig, RNA +from ppanggolin.region import Region, Spot from ppanggolin.pangenome import Pangenome from ppanggolin.utils import write_compressed_or_not, mk_outdir, restricted_float from ppanggolin.formats.readBinaries import check_pangenome_info @@ -618,6 +619,117 @@ def write_projections(output: str, compress: bool = False): write_org_file(org, outdir, compress) 
logging.getLogger().info("Done writing the projection files") +def write_gff(output: str, compress: bool = False): + """ + Write the gff files for all organisms + + :param output: Path to output directory + :param compress: Compress the file in .gz + """ + logging.getLogger().info("Writing the gff files...") + outdir = output + "/gff" + if not os.path.exists(outdir): + os.makedirs(outdir) + + contig_to_rgp = defaultdict(list) + for rgp in pan.regions: + contig_to_rgp[rgp.contig].append(rgp) + + rgp_to_spot = {rgp:spot for spot in pan.spots for rgp in spot.rgp} + + for org in pan.organisms: + print(org) + write_gff_file(org, outdir, compress, contig_to_rgp, rgp_to_spot) + break + logging.getLogger().info("Done writing the gff files") + +def write_gff_file(org, outdir, compress, contig_to_rgp, rgp_to_spot): + """ + Write the gff file of the provided organism. + + :param org: Organism to write the gff + :param output: Path to output directory + :param compress: Compress the file in .gz + """ + + + # regions = sorted(pan.regions, key=lambda x: (x.organism.name, x.contig.name, x.start)) + + # TODO: decide if we want to keep source information when parsing + # gff/gbff to be able to reinject it in the gff output + + with write_compressed_or_not(outdir + "/" + org.name + ".gff", compress) as outfile: + # write gff header + outfile.write('##gff-version 3\n') + for contig in org.contigs: + if contig.length is None: + raise AttributeError(f'Contig {contig.name} has no length defined.') + + outfile.write(f'##sequence-region {contig.name} 1 {contig.length}\n') + + for contig in org.contigs: + contig_elements = sorted(contig_to_rgp[contig] + contig.genes + list(contig.RNAs), key=lambda x: (x.start)) + + for feature in contig_elements: + + if type(feature) in [Gene, RNA]: + feat_type = feature.type + + strand = feature.strand + + strand = feature.strand + + attributes = [("ID", feature.ID), + ("Name", feature.name), + ("product", feature.product), + ] + + score = '.' 
+ + if type(feature) == Gene: + attributes += [ + ("Family", feature.family.name), + ("Partition", feature.family.named_partition), + ('RGP', ','.join((rgp.name for rgp in feature.RGP))), + ('Module', ','.join((f"module_{module.ID}" for module in feature.family.modules)) ) + ] + # TODO understand why RGP attribute is a set of RGP rather than an uniq RGP + # Does gene can have more than one RGP? + + + elif type(feature) == Region: + feat_type = "region" + strand = "." + score = feature.score # TODO is RGP score make sens and do we want it in gff file? + attributes = [ + ("Name", feature.name), + ("Spot", rgp_to_spot.get(feature.name, "No_spot")) + ] + + + else: + raise TypeError(f'The feature to write in gff file does not have an expected types. {type(feature)}') + + + attributes_str = ';'.join([f"{k}={v}" for k,v in attributes if v != "" and v is not None]) + + line = [contig.name, + ".", # Source + feat_type, + feature.start, + feature.stop, + score, + strand, + ".", + attributes_str, + ] + line_str = '\t'.join(map(str, line)) + outfile.write(line_str + "\n") + + + + + def write_parts(output: str, soft_core: float = 0.95): """ @@ -914,7 +1026,7 @@ def write_rgp_modules(output, compress): def write_flat_files(pangenome: Pangenome, output: str, cpu: int = 1, soft_core: float = 0.95, dup_margin: float = 0.05, csv: bool = False, gene_pa: bool = False, gexf: bool = False, light_gexf: bool = False, - projection: bool = False, stats: bool = False, json: bool = False, partitions: bool = False, + projection: bool = False, gff: bool = False, stats: bool = False, json: bool = False, partitions: bool = False, regions: bool = False, families_tsv: bool = False, spots: bool = False, borders: bool = False, modules: bool = False, spot_modules: bool = False, compress: bool = False, disable_bar: bool = False): @@ -931,6 +1043,7 @@ def write_flat_files(pangenome: Pangenome, output: str, cpu: int = 1, soft_core: :param gexf: write pangenome graph in gexf format :param light_gexf: 
write pangenome graph with only gene families :param projection: write projection of pangenome for organisms + :param gff: write a gff file with pangenome annotation for each organisms :param stats: write statistics about pangenome :param json: write pangenome graph in json file :param partitions: write the gene families for each partition @@ -944,7 +1057,7 @@ def write_flat_files(pangenome: Pangenome, output: str, cpu: int = 1, soft_core: :param disable_bar: Disable progress bar """ # TODO Add force parameter to check if output already exist - if not any(x for x in [csv, gene_pa, gexf, light_gexf, projection, stats, json, partitions, regions, spots, borders, + if not any(x for x in [csv, gene_pa, gexf, light_gexf, projection, gff, stats, json, partitions, regions, spots, borders, families_tsv, modules, spot_modules]): raise Exception("You did not indicate what file you wanted to write.") @@ -964,10 +1077,10 @@ def write_flat_files(pangenome: Pangenome, output: str, cpu: int = 1, soft_core: pan = pangenome if csv or gene_pa or gexf or light_gexf or projection or stats or json or partitions or regions or spots or \ - families_tsv or borders or modules or spot_modules: + families_tsv or borders or modules or spot_modules or gff: needAnnotations = True needFamilies = True - if projection or stats or partitions or regions or spots or borders: + if projection or stats or partitions or regions or spots or borders or gff: needPartitions = True if gexf or light_gexf or json: needGraph = True @@ -978,7 +1091,7 @@ def write_flat_files(pangenome: Pangenome, output: str, cpu: int = 1, soft_core: metatype = "families" else: needMetadata = False - if regions or spots or borders or spot_modules: + if regions or spots or borders or spot_modules or gff: needRegions = True if spots or borders or spot_modules: # or projection: needSpots = True @@ -1004,6 +1117,8 @@ def write_flat_files(pangenome: Pangenome, output: str, cpu: int = 1, soft_core: 
processes.append(p.apply_async(func=write_gexf, args=(output, True, soft_core))) if projection: processes.append(p.apply_async(func=write_projections, args=(output, compress))) + if gff: + processes.append(p.apply_async(func=write_gff, args=(output, compress))) if stats: processes.append(p.apply_async(func=write_stats, args=(output, soft_core, dup_margin, compress))) if json: @@ -1040,7 +1155,7 @@ def launch(args: argparse.Namespace): global pan pan.add_file(args.pangenome) write_flat_files(pan, args.output, cpu=args.cpu, soft_core=args.soft_core, dup_margin=args.dup_margin, csv=args.csv, - gene_pa=args.Rtab, gexf=args.gexf, light_gexf=args.light_gexf, projection=args.projection, + gene_pa=args.Rtab, gexf=args.gexf, light_gexf=args.light_gexf, projection=args.projection, gff=args.gff, stats=args.stats, json=args.json, partitions=args.partitions, regions=args.regions, families_tsv=args.families_tsv, spots=args.spots, borders=args.borders, modules=args.modules, spot_modules=args.spot_modules, compress=args.compress, disable_bar=args.disable_prog_bar) @@ -1088,6 +1203,8 @@ def parser_flat(parser: argparse.ArgumentParser): optional.add_argument("--projection", required=False, action="store_true", help="a csv file for each organism providing information on the projection of the graph " "on the organism") + optional.add_argument("--gff", required=False, action="store_true", + help="Generate a gff file for each organism containing pangenome annotations.") optional.add_argument("--stats", required=False, action="store_true", help="tsv files with some statistics for each organism and for each gene family") optional.add_argument("--partitions", required=False, action="store_true", From fc5bca215eccd9f87244ff768e66f4b589d89248 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Fri, 11 Aug 2023 17:19:10 +0200 Subject: [PATCH 073/173] fix module and Spot loading --- ppanggolin/formats/writeFlat.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff 
--git a/ppanggolin/formats/writeFlat.py b/ppanggolin/formats/writeFlat.py index 7263fbff..34684f17 100644 --- a/ppanggolin/formats/writeFlat.py +++ b/ppanggolin/formats/writeFlat.py @@ -635,15 +635,15 @@ def write_gff(output: str, compress: bool = False): for rgp in pan.regions: contig_to_rgp[rgp.contig].append(rgp) - rgp_to_spot = {rgp:spot for spot in pan.spots for rgp in spot.rgp} - + rgp_to_spot_id = {rgp:f"spot_{spot.ID}" for spot in pan.spots for rgp in spot.regions} + for org in pan.organisms: print(org) - write_gff_file(org, outdir, compress, contig_to_rgp, rgp_to_spot) + write_gff_file(org, outdir, compress, contig_to_rgp, rgp_to_spot_id) break logging.getLogger().info("Done writing the gff files") -def write_gff_file(org, outdir, compress, contig_to_rgp, rgp_to_spot): +def write_gff_file(org, outdir, compress, contig_to_rgp, rgp_to_spotid): """ Write the gff file of the provided organism. @@ -677,7 +677,7 @@ def write_gff_file(org, outdir, compress, contig_to_rgp, rgp_to_spot): strand = feature.strand - strand = feature.strand + source = "." attributes = [("ID", feature.ID), ("Name", feature.name), @@ -699,11 +699,13 @@ def write_gff_file(org, outdir, compress, contig_to_rgp, rgp_to_spot): elif type(feature) == Region: feat_type = "region" + source = "ppanggolin" strand = "." score = feature.score # TODO is RGP score make sens and do we want it in gff file? 
attributes = [ ("Name", feature.name), - ("Spot", rgp_to_spot.get(feature.name, "No_spot")) + ("Spot", rgp_to_spotid.get(feature, "No_spot")), + ("Note", "Region of Genomic Plasticity (RGP)") ] @@ -714,7 +716,7 @@ def write_gff_file(org, outdir, compress, contig_to_rgp, rgp_to_spot): attributes_str = ';'.join([f"{k}={v}" for k,v in attributes if v != "" and v is not None]) line = [contig.name, - ".", # Source + source, # Source feat_type, feature.start, feature.stop, @@ -1091,13 +1093,13 @@ def write_flat_files(pangenome: Pangenome, output: str, cpu: int = 1, soft_core: metatype = "families" else: needMetadata = False - if regions or spots or borders or spot_modules or gff: + if regions or spots or borders or spot_modules: needRegions = True if spots or borders or spot_modules: # or projection: needSpots = True if modules or spot_modules: # or projection: needModules = True - if projection: + if projection or gff: needRegions = True if pan.status["predictedRGP"] == "inFile" else False needSpots = True if pan.status["spots"] == "inFile" else False needModules = True if pan.status["modules"] == "inFile" else False From 06f5739da7daa32e545d761f25c50c28c7142089 Mon Sep 17 00:00:00 2001 From: Jean MAINGUY Date: Thu, 24 Aug 2023 17:59:50 +0200 Subject: [PATCH 074/173] correct error in singleton seq management --- ppanggolin/align/alignOnPang.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ppanggolin/align/alignOnPang.py b/ppanggolin/align/alignOnPang.py index 8925fee6..2e231ef8 100644 --- a/ppanggolin/align/alignOnPang.py +++ b/ppanggolin/align/alignOnPang.py @@ -129,7 +129,7 @@ def get_seq(seq_file: TextIOWrapper) -> Set[str]: seqset = set() for line in seq_file: if line.startswith(">"): - seqset.add(line[1:]) + seqset.add(line[1:].split()[0].strip()) return seqset @@ -162,7 +162,7 @@ def project_and_write_partition(seqid_to_gene_family: Dict[str, GeneFamily], seq with open(partition_proj, "w") as partProjFile: for input_seq, pangFam in 
seqid_to_gene_family.items(): partProjFile.write(input_seq + "\t" + pangFam.named_partition + "\n") - for remainingSeq in (seqid_to_gene_family.keys() & seq_set): + for remainingSeq in seq_set - seqid_to_gene_family.keys(): partProjFile.write(remainingSeq + "\tcloud\n") # if there is no hit, it's going to be cloud genes. return partition_proj From 2cc133b96c4d1d2793701acd7e9f87996072778d Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Thu, 24 Aug 2023 18:04:36 +0200 Subject: [PATCH 075/173] fix clustering treshold, add logging and manage gene id --- ppanggolin/formats/writeFlat.py | 5 +++++ ppanggolin/formats/writeSequences.py | 3 ++- ppanggolin/projection/projection.py | 17 +++++++++++++---- 3 files changed, 20 insertions(+), 5 deletions(-) diff --git a/ppanggolin/formats/writeFlat.py b/ppanggolin/formats/writeFlat.py index 93941559..1af8ffa5 100644 --- a/ppanggolin/formats/writeFlat.py +++ b/ppanggolin/formats/writeFlat.py @@ -338,9 +338,14 @@ def write_gexf(output: Path, light: bool = True, compress: bool = False): logging.getLogger("PPanGGOLiN").info(txt) outname = output / f"pangenomeGraph{'_light' if light else ''}.gexf" with write_compressed_or_not(outname, compress) as gexf: + graph_type = 'ligth gexf' if light else 'gexf' + logging.getLogger("PPanGGOLiN").debug(f"Writing the {graph_type} header...") write_gexf_header(gexf, light) + logging.getLogger("PPanGGOLiN").debug(f"Writing the {graph_type} nodes...") write_gexf_nodes(gexf, light) + logging.getLogger("PPanGGOLiN").debug(f"Writing the {graph_type} edges...") write_gexf_edges(gexf, light) + logging.getLogger("PPanGGOLiN").debug(f"Writing the {graph_type} ends...") write_gexf_end(gexf) logging.getLogger("PPanGGOLiN").info(f"Done writing the gexf file : '{outname.as_posix()}'") diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py index b0c29cbe..4df464a8 100644 --- a/ppanggolin/formats/writeSequences.py +++ b/ppanggolin/formats/writeSequences.py @@ -37,7 +37,8 @@ def 
write_gene_sequences_from_annotations(genes_to_write: Pangenome, file_obj: T logging.getLogger("PPanGGOLiN").info("Writing all of the CDS sequences...") for gene in tqdm(genes_to_write, unit="gene", disable=disable_bar): if gene.type == "CDS": - file_obj.write(f'>{add}{gene.ID}\n') + gene_id = gene.ID if gene.local_identifier == "" else gene.local_identifier + file_obj.write(f'>{add}{gene_id}\n') file_obj.write(f'{gene.dna}\n') file_obj.flush() diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index 7b1bf280..d256f2de 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -104,6 +104,9 @@ def launch(args: argparse.Namespace): filename=args.annot_file, circular_contigs=[], pseudo=args.use_pseudo) + if input_organism.number_of_genes() == 0: + raise ValueError("The input organism lacks gene annotations. " + f"Please verify the provided annotation file: {args.annot_file}") if not has_sequence: if args.fasta_file: @@ -123,7 +126,9 @@ def launch(args: argparse.Namespace): input_organism = annotate_organism(org_name=args.organism_name, file_name=args.fasta_file, circular_contigs=[], tmpdir=args.tmpdir, code=args.translation_table, norna=annotate_params.norna, kingdom=annotate_params.kingdom, overlap=annotate_params.allow_overlap, procedure=annotate_params.prodigal_procedure) - + if input_organism.number_of_genes() == 0: + raise ValueError("No genes have been predicted in the input organism's FASTA file, making projection impossible.") + else: raise Exception( "At least one of --fasta_file or --anno_file must be given") @@ -256,6 +261,8 @@ def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_org seq_fasta_file = output / f"{input_organism.name}.fasta" + logging.info(f'The input organism has {input_organism.number_of_genes()} genes. 
Writting them in {seq_fasta_file}') + with open(seq_fasta_file, "w") as fh_out_faa: write_gene_sequences_from_annotations( input_organism.genes, fh_out_faa, disable_bar=True) @@ -273,11 +280,13 @@ def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_org lonely_gene = 0 for gene in input_organism.genes: + gene_id = gene.ID if gene.local_identifier == "" else gene.local_identifier + try: - gene_family = seqid_to_gene_family[gene.ID] + gene_family = seqid_to_gene_family[gene_id] gene_family.add_gene(gene) except KeyError: - new_gene_family = pangenome.add_gene_family(gene.ID) + new_gene_family = pangenome.add_gene_family(gene_id) new_gene_family.add_gene(gene) new_gene_family.add_partition("Cloud") lonely_gene += 1 @@ -735,7 +744,7 @@ def parser_projection(parser: argparse.ArgumentParser): help="DO NOT Realign gene families to link fragments with" "their non-fragmented gene family. (default: False)") - optional.add_argument('--identity', required=False, type=restricted_float, default=0.5, + optional.add_argument('--identity', required=False, type=restricted_float, default=0.8, help="min identity percentage threshold") optional.add_argument('--coverage', required=False, type=restricted_float, default=0.8, From 955f7121d4478457a195a44c8fc8dc4c8ffce656 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Mon, 28 Aug 2023 11:30:01 +0200 Subject: [PATCH 076/173] fix default config generation for projection --- ppanggolin/utility/utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ppanggolin/utility/utils.py b/ppanggolin/utility/utils.py index ff222f02..41cce5e2 100644 --- a/ppanggolin/utility/utils.py +++ b/ppanggolin/utility/utils.py @@ -151,8 +151,7 @@ def launch_default_config(args: argparse.Namespace): commands = [initial_command] + [sub_cmd for sub_cmd in ALL_WORKFLOW_DEPENDENCIES if sub_cmd in workflow_dependencies] elif initial_command == "projection": - commands = [initial_command] + [sub_cmd for sub_cmd in 
ALL_WORKFLOW_DEPENDENCIES if - sub_cmd not in ["write", "draw"]] + commands = [initial_command] + ['annotate'] else: commands = [initial_command] From 95497b7281f380999c34bd9af324359c3fee00c1 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 30 Aug 2023 14:29:49 +0200 Subject: [PATCH 077/173] add source information --- ppanggolin/formats/writeFlat.py | 43 +++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/ppanggolin/formats/writeFlat.py b/ppanggolin/formats/writeFlat.py index 34684f17..bf862404 100644 --- a/ppanggolin/formats/writeFlat.py +++ b/ppanggolin/formats/writeFlat.py @@ -7,7 +7,7 @@ from multiprocessing import get_context from collections import Counter, defaultdict import logging -from typing import TextIO +from typing import TextIO, Dict import pkg_resources from statistics import median, mean, stdev import os @@ -631,33 +631,41 @@ def write_gff(output: str, compress: bool = False): if not os.path.exists(outdir): os.makedirs(outdir) + if pan.parameters["annotation"]["read_annotations_from_file"]: + annotation_sources = {"rRNA": "external", + "tRNA": "external", + "CDS":"external"} + else: + annotation_sources = {} + contig_to_rgp = defaultdict(list) for rgp in pan.regions: contig_to_rgp[rgp.contig].append(rgp) - + rgp_to_spot_id = {rgp:f"spot_{spot.ID}" for spot in pan.spots for rgp in spot.regions} for org in pan.organisms: - print(org) - write_gff_file(org, outdir, compress, contig_to_rgp, rgp_to_spot_id) - break + write_gff_file(org, contig_to_rgp, rgp_to_spot_id, outdir, compress, annotation_sources) + logging.getLogger().info("Done writing the gff files") -def write_gff_file(org, outdir, compress, contig_to_rgp, rgp_to_spotid): - """ - Write the gff file of the provided organism. 
- :param org: Organism to write the gff - :param output: Path to output directory - :param compress: Compress the file in .gz +def write_gff_file(org: Organism, contig_to_rgp: Dict[Contig, Region], + rgp_to_spotid: Dict[Region, str], outdir: str, compress: bool, + annotation_sources: Dict[str, str]): + """ + Write the GFF file of the provided organism. + + :param org: Organism object for which the GFF file is being written. + :param contig_to_rgp: Dictionary mapping Contig objects to their corresponding Region objects. + :param rgp_to_spotid: Dictionary mapping Region objects to their corresponding spot IDs. + :param outdir: Path to the output directory where the GFF file will be written. + :param compress: If True, compress the output GFF file using .gz format. + :param annotation_sources: A dictionary that maps types of features to their source information. + :type annotation_sources: Dict[str, str] """ - # regions = sorted(pan.regions, key=lambda x: (x.organism.name, x.contig.name, x.start)) - - # TODO: decide if we want to keep source information when parsing - # gff/gbff to be able to reinject it in the gff output - with write_compressed_or_not(outdir + "/" + org.name + ".gff", compress) as outfile: # write gff header outfile.write('##gff-version 3\n') @@ -677,8 +685,7 @@ def write_gff_file(org, outdir, compress, contig_to_rgp, rgp_to_spotid): strand = feature.strand - source = "." 
- + source = annotation_sources.get(feat_type, "external") attributes = [("ID", feature.ID), ("Name", feature.name), ("product", feature.product), From 692c9bd0d7665adb1bd5f154488f245811b0eecc Mon Sep 17 00:00:00 2001 From: axbazin Date: Sun, 3 Sep 2023 13:34:26 +0200 Subject: [PATCH 078/173] add spot to region attributes --- ppanggolin/region.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/ppanggolin/region.py b/ppanggolin/region.py index aff85efd..2b7b6c3e 100644 --- a/ppanggolin/region.py +++ b/ppanggolin/region.py @@ -31,6 +31,7 @@ def __init__(self, region_id: str): self.name = region_id self.score = 0 self.ID = Region.id_counter + self.spot = None Region.id_counter += 1 def __str__(self): @@ -77,10 +78,24 @@ def append(self, gene: Gene): if isinstance(gene, Gene): self.genes.append(gene) - gene.RGP.add(self) + gene.RGP = self else: raise TypeError(f"Unexpected class / type for {type(gene)} when adding it to a RGP") + + def add_spot(self, spot: Spot): + """Sets the spot of the RGP + + :param spot: spot to which the RGP is added + + :raise TypeError: if the given spot is not a Spot. 
+ """ + if isinstance(spot, Spot): + self.spot = spot#only 1 spot possible + else: + raise TypeError(f"Unexpected class / type for {type(spot)} when adding it to a RGP") + + @property def families(self) -> set: """Get the gene families in the RGP @@ -267,6 +282,7 @@ def add_region(self, region): """ if isinstance(region, Region): self.regions.add(region) + region.add_spot(self) def spot_2_families(self): """Add to Gene Families a link to spot""" From f56bf54abe6b8a9ea81a239bbf596483906ea7f4 Mon Sep 17 00:00:00 2001 From: axbazin Date: Sun, 3 Sep 2023 13:35:23 +0200 Subject: [PATCH 079/173] genes can only be in 1 rgp (change type of gene.RGP) --- ppanggolin/genome.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ppanggolin/genome.py b/ppanggolin/genome.py index 89424621..78f69a46 100644 --- a/ppanggolin/genome.py +++ b/ppanggolin/genome.py @@ -104,7 +104,7 @@ def __init__(self, gene_id: str): super().__init__(gene_id) self.position = None self.family = None - self.RGP = set() + self.RGP = None self.genetic_code = None self.protein = None From d80acef7267b49b7d39a4b27ddfa9064e54cc84d Mon Sep 17 00:00:00 2001 From: axbazin Date: Sun, 3 Sep 2023 13:36:19 +0200 Subject: [PATCH 080/173] fix #130, writing out only the RGP and spot of the gene with --projection --- ppanggolin/formats/writeFlat.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ppanggolin/formats/writeFlat.py b/ppanggolin/formats/writeFlat.py index aa0078bc..e6918028 100644 --- a/ppanggolin/formats/writeFlat.py +++ b/ppanggolin/formats/writeFlat.py @@ -589,13 +589,13 @@ def write_org_file(org: Organism, output: str, compress: bool = False): len(gene.family.get_genes_per_org(org)), gene.family.named_partition, nb_pers, nb_shell, nb_cloud] if needRegions: - if len(gene.RGP) > 0: - rgp = ','.join([str(region.name) for region in gene.RGP]) + if gene.RGP is not None: + rgp = gene.RGP.name row.append(rgp) - if needSpots: - if len(gene.family.spot) > 0: - spot = 
','.join([str(s.ID) for s in gene.family.spot]) - row.append(spot) + if needSpots: + if gene.RGP.spot is not None: + spot = gene.RGP.spot.ID + row.append(spot) if needModules: if len(gene.family.modules) > 0: modules = ','.join(["module_" + str(module.ID) for module in gene.family.modules]) From 1c30f0f0abc212d6b445e43aed4b94102f643d4d Mon Sep 17 00:00:00 2001 From: axbazin Date: Sun, 3 Sep 2023 13:59:41 +0200 Subject: [PATCH 081/173] 'all' now truly draws all spots, and 'synteny' draws all RGP with more than 1 conserved gene order(former 'all') --- ppanggolin/figures/draw_spot.py | 5 ++++- ppanggolin/figures/drawing.py | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/ppanggolin/figures/draw_spot.py b/ppanggolin/figures/draw_spot.py index 5c890c5a..2e65be92 100644 --- a/ppanggolin/figures/draw_spot.py +++ b/ppanggolin/figures/draw_spot.py @@ -644,7 +644,10 @@ def draw_spots(pangenome: Pangenome, output: str, spot_list: str, disable_bar: b need_rgp=True, need_spots=True, need_modules=need_mod, disable_bar=disable_bar) if spot_list == 'all' or any(x == 'all' for x in spot_list): - logging.getLogger().debug(f"all is found in spot list, all spot are drawn.") + logging.getLogger().debug(f"'all' value is found in spot list, all spots are drawn.") + selected_spots = pangenome.spots + elif spot_list == "synteny" or any(x == 'synteny' for x in spot_list): + logging.getLogger().debug(f"'synteny' value is found in spot list, all spots with more than 1 conserved synteny are drawn.") selected_spots = [s for s in pangenome.spots if len(s.get_uniq_ordered_set()) > 1] else: curated_spot_list = {'spot_' + str(s) if not s.startswith("spot_") else str(s) for s in spot_list} diff --git a/ppanggolin/figures/drawing.py b/ppanggolin/figures/drawing.py index 6536cd7b..d26766f8 100644 --- a/ppanggolin/figures/drawing.py +++ b/ppanggolin/figures/drawing.py @@ -90,7 +90,7 @@ def parser_draw(parser: argparse.ArgumentParser): optional.add_argument("--draw_spots", 
required=False, default=False,action="store_true", help="draw plots for spots of the pangenome") optional.add_argument("--spots", required=False, default='all', nargs='+', - help="a comma-separated list of spots to draw (or 'all' to draw all spots).") + help="a comma-separated list of spots to draw (or 'all' to draw all spots, or 'synteny' to draw spots with different RGP syntenies).") if __name__ == '__main__': From ac770945600fbd1a3f9017795e3047078ea6388e Mon Sep 17 00:00:00 2001 From: axbazin Date: Sun, 3 Sep 2023 14:03:02 +0200 Subject: [PATCH 082/173] check that RGP exists before getting its spot in --projection --- ppanggolin/formats/writeFlat.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ppanggolin/formats/writeFlat.py b/ppanggolin/formats/writeFlat.py index e6918028..e12eee5a 100644 --- a/ppanggolin/formats/writeFlat.py +++ b/ppanggolin/formats/writeFlat.py @@ -592,10 +592,10 @@ def write_org_file(org: Organism, output: str, compress: bool = False): if gene.RGP is not None: rgp = gene.RGP.name row.append(rgp) - if needSpots: - if gene.RGP.spot is not None: - spot = gene.RGP.spot.ID - row.append(spot) + if needSpots: + if gene.RGP is not None and gene.RGP.spot is not None: + spot = gene.RGP.spot.ID + row.append(spot) if needModules: if len(gene.family.modules) > 0: modules = ','.join(["module_" + str(module.ID) for module in gene.family.modules]) From 4dd6e6696a7a2c1d1729c5955fdef4726c415518 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Mon, 4 Sep 2023 11:16:51 +0200 Subject: [PATCH 083/173] allow possibility to keep tmp files in align and cluster --- ppanggolin/align/alignOnPang.py | 59 +++++++++++++++++++++------------ ppanggolin/cluster/cluster.py | 49 ++++++++++++++++++--------- 2 files changed, 70 insertions(+), 38 deletions(-) diff --git a/ppanggolin/align/alignOnPang.py b/ppanggolin/align/alignOnPang.py index 2e231ef8..325066f5 100644 --- a/ppanggolin/align/alignOnPang.py +++ b/ppanggolin/align/alignOnPang.py @@ -10,6 
+10,7 @@ from collections import defaultdict from typing import List, Tuple, Set, Dict, IO, Iterator from pathlib import Path +import time # local libraries from ppanggolin.formats import check_pangenome_info @@ -20,7 +21,7 @@ from ppanggolin.figures.draw_spot import draw_selected_spots, subgraph -def createdb(file_obj: TextIOWrapper, tmpdir: Path) -> IO: +def createdb(file_obj: TextIOWrapper, tmpdir: Path, delete_tmp_file: bool = True) -> IO: """ Create a MMseqs2 sequence database with the given fasta file @@ -29,7 +30,7 @@ def createdb(file_obj: TextIOWrapper, tmpdir: Path) -> IO: :return: DB file """ - seqdb = tempfile.NamedTemporaryFile(mode="w", dir=tmpdir) + seqdb = tempfile.NamedTemporaryFile(mode="w", dir=tmpdir, delete=False) cmd = ["mmseqs", "createdb", file_obj.name, seqdb.name, '--dbtype', '0'] logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) subprocess.run(cmd, stdout=subprocess.DEVNULL) @@ -62,7 +63,7 @@ def align_seq_to_pang(pang_file: IO, seq_file: TextIOWrapper, output: Path, if no_defrag: cov_mode = "0" # coverage of query and target - with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir.as_posix(), prefix="aln_result_db_file") as aln_db: + with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir.as_posix(), prefix="aln_result_db_file", delete=False) as aln_db: cmd = ["mmseqs", "search", seq_db.name, pang_db.name, aln_db.name, tmpdir.as_posix(), "-a", "--min-seq-id", str(identity), "-c", str(coverage), "--cov-mode", cov_mode, "--threads", str(cpu)] if is_nucleotid: @@ -324,23 +325,25 @@ def get_seq2pang(pangenome: Pangenome, sequence_file: Path, output: Path, tmpdir def align(pangenome: Pangenome, sequence_file: Path, output: Path, identity: float = 0.8, coverage: float = 0.8, no_defrag: bool = False, cpu: int = 1, getinfo: bool = False, - draw_related: bool = False, tmpdir: Path = None, disable_bar: bool = False): + draw_related: bool = False, tmpdir: Path = None, disable_bar: bool = False, keep_tmp=False): """ - Main function to align 
pangenome sequences with fasta file using MMSeqs2 + Aligns pangenome sequences with sequences in a FASTA file using MMSeqs2. - :param pangenome: Pangenome with gene families to align with the given input sequences - :param sequence_file: Path to sequences in a .fasta file to align with the given Pangenome - :param output: Path of the output directory - :param identity: minimal identity threshold for the alignment - :param coverage: minimal coverage threshold for the alignment - :param no_defrag: do not use the defrag workflow if true - :param cpu: number of CPU cores to use - :param getinfo: Extract info related to the best hit of each query, such as the RGP it is in, or the spots. - :param draw_related: Draw figures and graphs in a gexf format of spots associated to the input sequences - :param tmpdir: Temporary directory - :param disable_bar: Disable the progresse bar + :param pangenome: Pangenome containing gene families to align with the input sequences. + :param sequence_file: Path to a FASTA file containing sequences to align with the pangenome. + :param output: Path to the output directory. + :param identity: Minimum identity threshold for the alignment. + :param coverage: Minimum coverage threshold for the alignment. + :param no_defrag: If True, the defrag workflow will not be used. + :param cpu: Number of CPU cores to use. + :param getinfo: If True, extract information related to the best hit of each query, such as the RGP it is in or the spots. + :param draw_related: If True, draw figures and graphs in a gexf format of spots associated with the input sequences. + :param tmpdir: Temporary directory for intermediate files. + :param disable_bar: If True, disable the progress bar. + :param keep_tmp: If True, keep temporary files. 
""" + tmpdir = Path(tempfile.gettempdir()) if tmpdir is None else tmpdir if pangenome.status["geneFamilySequences"] not in ["inFile", "Loaded", "Computed"]: raise Exception("Cannot use this function as your pangenome does not have gene families representatives " @@ -358,20 +361,31 @@ def align(pangenome: Pangenome, sequence_file: Path, output: Path, identity: flo else: check_pangenome_info(pangenome, need_families=True, disable_bar=disable_bar) - # TODO add possibility to keep_tmp - new_tmpdir = tempfile.TemporaryDirectory(dir=tmpdir) - tmp_path = Path(new_tmpdir.name) + if keep_tmp: + + dir_name = 'align_tmpdir' + time.strftime("_%Y-%m-%d_%H.%M.%S",time.localtime()) + tmp_path = Path(tmpdir) / dir_name + mk_outdir(tmp_path, force=True) + logging.getLogger().info(f'Temporary files will be written {tmp_path} and kept for reference.') + + else: + # if keep tmp is false, TemporaryDirectory created and then removed + new_tmpdir = tempfile.TemporaryDirectory(dir=tmpdir) + tmp_path = Path(new_tmpdir.name) + print(tmp_path) + seq_set, align_file, seq2pang = get_seq2pang(pangenome, sequence_file, output, tmp_path, cpu, no_defrag, identity, coverage) if getinfo or draw_related: # TODO Add getinfo to function and remove if get_seq_info(seq2pang, pangenome, output, draw_related, disable_bar=disable_bar) + part_proj = project_and_write_partition(seq2pang, seq_set, output) # write the partition assignation only logging.getLogger().info(f"sequences partition projection : '{part_proj}'") logging.getLogger().info(f"{len(seq2pang)} sequences over {len(seq_set)} have at least one hit in the pangenome.") logging.getLogger().info(f"Blast-tab file of the alignment : '{align_file.name}'") - new_tmpdir.cleanup() + # new_tmpdir.cleanup() def launch(args: argparse.Namespace): @@ -385,7 +399,7 @@ def launch(args: argparse.Namespace): pangenome.add_file(args.pangenome) align(pangenome=pangenome, sequence_file=args.sequences, output=args.output, tmpdir=args.tmpdir, identity=args.identity, 
coverage=args.coverage, no_defrag=args.no_defrag, cpu=args.cpu, getinfo=args.getinfo, - draw_related=args.draw_related, disable_bar=args.disable_prog_bar) + draw_related=args.draw_related, disable_bar=args.disable_prog_bar, keep_tmp=args.keep_tmp) def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: @@ -439,7 +453,8 @@ def parser_align(parser: argparse.ArgumentParser): optional.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus") optional.add_argument("--tmpdir", required=False, type=str, default=Path(tempfile.gettempdir()), help="directory for storing temporary files") - + optional.add_argument("--keep_tmp", required=False, default=False, action="store_true", + help="Keeping temporary files (useful for debugging).") if __name__ == '__main__': """To test local change and allow using debugger""" diff --git a/ppanggolin/cluster/cluster.py b/ppanggolin/cluster/cluster.py index 1514d4d4..649ca6f1 100644 --- a/ppanggolin/cluster/cluster.py +++ b/ppanggolin/cluster/cluster.py @@ -10,6 +10,7 @@ import argparse from typing import TextIO, Tuple, Dict, Set from pathlib import Path +import time # installed libraries from networkx import Graph @@ -22,6 +23,7 @@ from ppanggolin.formats.writeBinaries import write_pangenome, erase_pangenome from ppanggolin.formats.readBinaries import check_pangenome_info, get_gene_sequences_from_file from ppanggolin.formats.writeSequences import write_gene_sequences_from_annotations +from ppanggolin.utils import mk_outdir # Global functions @@ -274,24 +276,34 @@ def read_gene2fam(pangenome: Pangenome, gene_to_fam: dict, disable_bar: bool = F def clustering(pangenome: Pangenome, tmpdir: Path, cpu: int = 1, defrag: bool = True, code: int = 11, coverage: float = 0.8, identity: float = 0.8, mode: int = 1, force: bool = False, - disable_bar: bool = False): + disable_bar: bool = False, keep_tmp_files: bool = True): """ - Main function to cluster pangenome gene sequences into 
families + Cluster gene sequences from an annotated pangenome into families. + + :param pangenome: Annotated Pangenome object. + :param tmpdir: Path to a temporary directory for intermediate files. + :param cpu: Number of CPU cores to use for clustering. + :param defrag: Allow removal of fragmented sequences during clustering. + :param code: Genetic code used for sequence translation. + :param coverage: Minimum coverage threshold for sequence alignment during clustering. + :param identity: Minimum identity threshold for sequence alignment during clustering. + :param mode: Clustering mode (MMseqs2 mode). + :param force: Force writing clustering results back to the pangenome. + :param disable_bar: Disable the progress bar during clustering. + :param keep_tmp_files: Keep temporary files (useful for debugging). - :param pangenome: Annoatated Pangenome - :param tmpdir: Path to temporary directory - :param cpu: number of CPU cores to use - :param defrag: Allow to remove fragment - :param code: Genetic code used - :param coverage: minimal coverage threshold for the alignment - :param identity: minimal identity threshold for the alignment - :param mode: MMseqs2 clustering mode - :param force: force to write in the pangenome - :param disable_bar: Allow to disable progress bar """ - newtmpdir = tempfile.TemporaryDirectory(dir=tmpdir) - tmp_path = Path(newtmpdir.name) + if keep_tmp_files: + dir_name = 'clustering_tmpdir' + time.strftime("_%Y-%m-%d_%H.%M.%S",time.localtime()) + tmp_path = Path(tmpdir) / dir_name + mk_outdir(tmp_path, force=True) + else: + newtmpdir = tempfile.TemporaryDirectory(dir=tmpdir) + tmp_path = Path(newtmpdir.name) + + # newtmpdir = tempfile.TemporaryDirectory(dir=tmpdir) + # tmp_path = Path(newtmpdir.name) with open(tmp_path/'nucleotid_sequences', "w") as sequence_file: check_pangenome_for_clustering(pangenome, sequence_file, force, disable_bar=disable_bar) logging.getLogger("PPanGGOLiN").info("Clustering all of the genes sequences...") @@ -306,7 
+318,8 @@ def clustering(pangenome: Pangenome, tmpdir: Path, cpu: int = 1, defrag: bool = aln = align_rep(rep, tmp_path, cpu, coverage, identity) genes2fam, fam2seq = refine_clustering(tsv, aln, fam2seq) pangenome.status["defragmented"] = "Computed" - newtmpdir.cleanup() + if not keep_tmp_files: + newtmpdir.cleanup() read_fam2seq(pangenome, fam2seq) read_gene2fam(pangenome, genes2fam, disable_bar=disable_bar) @@ -316,6 +329,7 @@ def clustering(pangenome: Pangenome, tmpdir: Path, cpu: int = 1, defrag: bool = pangenome.parameters["cluster"] = {} pangenome.parameters["cluster"]["coverage"] = coverage pangenome.parameters["cluster"]["identity"] = identity + pangenome.parameters["cluster"]["mode"] = mode pangenome.parameters["cluster"]["# defragmentation"] = defrag pangenome.parameters["cluster"]["no_defrag"] = not defrag @@ -441,7 +455,7 @@ def launch(args: argparse.Namespace): "creation. To infer singleton you should give a clustering") clustering(pangenome, args.tmpdir, args.cpu, defrag=not args.no_defrag, code=args.translation_table, coverage=args.coverage, identity=args.identity, mode=args.mode, force=args.force, - disable_bar=args.disable_prog_bar) + disable_bar=args.disable_prog_bar, keep_tmp_files=args.keep_tmp) logging.getLogger("PPanGGOLiN").info("Done with the clustering") else: if None in [args.tmpdir, args.cpu, args.no_defrag, args.translation_table, @@ -500,6 +514,9 @@ def parser_clust(parser: argparse.ArgumentParser): optional = parser.add_argument_group(title="Optional arguments") optional.add_argument("--tmpdir", required=False, type=str, default=Path(tempfile.gettempdir()), help="directory for storing temporary files") + optional.add_argument("--keep_tmp", required=False, default=False, action="store_true", + help="Keeping temporary files (useful for debugging).") + if __name__ == '__main__': From a6e97a457a848f01afa3be2693516e9cf3ba3605 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Mon, 4 Sep 2023 17:13:06 +0200 Subject: [PATCH 084/173] give 
possibility to align on all pang genes rather than representative --- ppanggolin/align/alignOnPang.py | 145 +++++++++++++++++++++++----- ppanggolin/projection/projection.py | 31 ++++-- 2 files changed, 141 insertions(+), 35 deletions(-) diff --git a/ppanggolin/align/alignOnPang.py b/ppanggolin/align/alignOnPang.py index 325066f5..0cd13fef 100644 --- a/ppanggolin/align/alignOnPang.py +++ b/ppanggolin/align/alignOnPang.py @@ -19,7 +19,7 @@ from ppanggolin.pangenome import Pangenome from ppanggolin.region import Spot from ppanggolin.figures.draw_spot import draw_selected_spots, subgraph - +from ppanggolin.formats.readBinaries import get_gene_sequences_from_file def createdb(file_obj: TextIOWrapper, tmpdir: Path, delete_tmp_file: bool = True) -> IO: """ @@ -41,7 +41,7 @@ def align_seq_to_pang(pang_file: IO, seq_file: TextIOWrapper, output: Path, tmpdir: Path, cpu: int = 1, no_defrag: bool = False, identity: float = 0.8, coverage: float = 0.8, is_nucleotid:bool = False, translation_table: int = None) -> Path: """ - Align pangenome sequences against fasta sequence + Align fasta sequence to pangenome sequences. 
:param pang_file: File with sequences in pangenome :param seq_file: File with sequences from input file @@ -56,22 +56,48 @@ def align_seq_to_pang(pang_file: IO, seq_file: TextIOWrapper, output: Path, :return: Alignement result file """ + translate_first = True + pang_file_is_nt = True + pang_db = createdb(pang_file, tmpdir) seq_db = createdb(seq_file, tmpdir) + if pang_file_is_nt: + logging.getLogger().debug(f"Pangenomes sequences will be translated by mmseqs with translation table {translation_table}") + + pangdb_aa = tempfile.NamedTemporaryFile(mode="w", dir=tmpdir, delete=False, prefix="pangenome_seq", suffix=".aa.DB") + + cmd = ["mmseqs", "translatenucs", pang_db.name, pangdb_aa.name, "--translation-table", + f"{translation_table}", "--threads", str(cpu)] + logging.getLogger().debug(" ".join(cmd)) + subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) + pang_db = pangdb_aa + + if translate_first and is_nucleotid: + logging.getLogger().debug(f"Input sequences will be translated by mmseqs with translation table {translation_table}") + + seqdb_aa = tempfile.NamedTemporaryFile(mode="w", dir=tmpdir, delete=False, prefix="input_seq", suffix=".aa.DB") + + cmd = ["mmseqs", "translatenucs", seq_db.name, seqdb_aa.name, "--translation-table", + f"{translation_table}", "--threads", str(cpu)] + + logging.getLogger().debug(" ".join(cmd)) + subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) + seq_db = seqdb_aa + cov_mode = "1" # coverage of target if no_defrag: cov_mode = "0" # coverage of query and target with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir.as_posix(), prefix="aln_result_db_file", delete=False) as aln_db: cmd = ["mmseqs", "search", seq_db.name, pang_db.name, aln_db.name, tmpdir.as_posix(), "-a", "--min-seq-id", str(identity), - "-c", str(coverage), "--cov-mode", cov_mode, "--threads", str(cpu)] - if is_nucleotid: - logging.getLogger().debug(f"Input sequences will be translated by mmseqs with translation table {translation_table}") + "-c", 
str(coverage), "--cov-mode", cov_mode, "--threads", str(cpu), "--max-seqs", str(1)] + if not translate_first and is_nucleotid: + logging.getLogger().debug(f"Input sequences will be translated by mmseqs search with translation table {translation_table}") cmd += ["--translation-table", f"{translation_table}", "--translate", "0" ] - logging.getLogger().info("Aligning sequences to cluster representatives...") + logging.getLogger().info("Aligning sequences")# to cluster representatives...") logging.getLogger().debug(" ".join(cmd)) subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) @@ -89,9 +115,42 @@ def align_seq_to_pang(pang_file: IO, seq_file: TextIOWrapper, output: Path, return outfile.name -def associate_input_seq_to_gene_family_from_aln(aln_res: Path, outdir:Path, pangenome: Pangenome) -> Tuple[Dict[str, GeneFamily], str]: +def associate_input_seq_to_gene_family_from_aln_all(aln_res: Path, outdir:Path, pangenome: Pangenome) -> Tuple[Dict[str, GeneFamily], str]: """ - Read alignment result to link input sequence to pangenome gene family + Read alignment result to link input sequences to pangenome gene family. + Alignment have been made against all genes of the pangenome. 
+ + :param aln_res: Alignement result file + :param outdir: Output directory + :param pangenome: Input pangenome + + :return: Dictionnary with sequence link to pangenome gene families and actual path to the cleaned alignment file + """ + + seq2pang = {} + result_file = outdir / f"alignment_input_seqs_to_pangenome_gene_families.tsv" # write the actual result file + logging.getLogger(f'Get write alignment file in {result_file}') + + with open(aln_res, "r") as alnFile, open(result_file, "w") as outfile : + for line in alnFile: + line_splitted = line.split() + + line_splitted[1] = line_splitted[1].replace("ppanggolin_", "") # remove the 'ppanggolin_' bit of the id + + outfile.write("\t".join(line_splitted) + "\n") + + input_seq_id, gene_id = line_splitted[0:2] + + if seq2pang.get(input_seq_id) is None: # if no results were found yet + seq2pang[input_seq_id] = pangenome.get_gene(gene_id).family # then the best hit is the first one we see. + + return seq2pang, outfile + + +def associate_input_seq_to_gene_family_from_aln_rep(aln_res: Path, outdir:Path, pangenome: Pangenome) -> Tuple[Dict[str, GeneFamily], str]: + """ + Read alignment result to link input sequences to pangenome gene family. + Alignment have been made against representative sequence of gene families of the pangenome. 
:param aln_res: Alignement result file :param outdir: Output directory @@ -121,7 +180,7 @@ def associate_input_seq_to_gene_family_from_aln(aln_res: Path, outdir:Path, pang def get_seq(seq_file: TextIOWrapper) -> Set[str]: """ - get sequence from sequence input file + get sequence if from sequence input file in fasta format :param seq_file: file containing sequences @@ -147,6 +206,22 @@ def write_gene_fam_sequences(pangenome: Pangenome, file_obj: IO, add: str = ""): file_obj.write(fam.sequence + "\n") # file_obj.flush() +def write_all_gene_sequences(pangenome: Pangenome, file_obj: IO, add: str = "", disable_bar:bool = False): + """ + Export the sequence of pangenome genes + + :param pangenome: Pangenome containing genes + :param file_obj: Temporary file where sequences will be written + :param add: Add prefix to sequence name + """ + gene_ids_to_write = {gene.ID for fam in pangenome.gene_families for gene in fam.genes} + get_gene_sequences_from_file(pangenome.file, file_obj, gene_ids_to_write, + disable_bar=disable_bar) + + # for gene in pangenome.genes: + # file_obj.write(">" + add + gene.ID + "\n") + # file_obj.write(gene.protein + "\n") + # file_obj.flush() def project_and_write_partition(seqid_to_gene_family: Dict[str, GeneFamily], seq_set: Set[str], output: Path) -> Path: """ @@ -293,32 +368,50 @@ def get_seq_info(seq_to_pang: dict, pangenome: Pangenome, output: Path, draw_rel def get_seq2pang(pangenome: Pangenome, sequence_file: Path, output: Path, tmpdir: Path, cpu: int = 1, no_defrag: bool = False, identity: float = 0.8, - coverage: float = 0.8, is_nucleotide:bool = False, translation_table:int = 11) -> Tuple[set, str, dict]: + coverage: float = 0.8, is_nucleotide: bool = False, translation_table: int = 11, + target_type: str = "representative") -> Tuple[set, str, dict]: """ - Assign a pangenome gene family to the input sequences. + Assign gene families from a pangenome to input sequences. 
- :param pangenome: Pangenome with gene families to align with the given input sequences - :param sequence_file: Path to sequences in a .fasta file to align with the given Pangenome - :param output: Path of the output directory - :param tmpdir: Temporary directory - :param cpu: number of CPU cores to use - :param no_defrag: do not use the defrag workflow if true - :param identity: minimal identity threshold for the alignment - :param coverage: minimal identity threshold for the alignment - :param is_nucleotide: Is the sequence file contains nucleotidic sequences. If True, the sequences are translated by mmseqs - :param translation_table: Translation table to use, if sequences are nucleotide and need to be translated. + This function aligns input sequences to gene families in a pangenome using MMseqs2 and assigns them + to appropriate gene families based on alignment results. + + :param pangenome: Annotated pangenome containing gene families. + :param sequence_file: Path to a FASTA file containing input sequences to align. + :param output: Path to the output directory where alignment results will be stored. + :param tmpdir: Temporary directory for intermediate files. + :param cpu: Number of CPU cores to use for the alignment (default: 1). + :param no_defrag: If True, the defragmentation workflow is skipped (default: False). + :param identity: Minimum identity threshold for the alignment (default: 0.8). + :param coverage: Minimum coverage threshold for the alignment (default: 0.8). + :param is_nucleotide: Set to True if the sequence file contains nucleotide sequences to be translated. + If True, sequences will be translated using the specified translation table (default: False). + :param translation_table: Translation table to use if sequences need to be translated (default: 11). + :param target_type: Type of target sequences to align input sequences with, either 'all' or 'representative' + (default: 'representative'). 
+ + :return: A tuple containing the set of input sequences, the path to the alignment result file, + and a dictionary mapping input sequences to gene families. + :raises ValueError: If the `target_type` is not 'all' or 'representative'. - :return: sequence set, blast-tab result file string, and sequences aligned with families """ with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir.as_posix(), delete=False, suffix=".faa") as tmp_pang_file: - write_gene_fam_sequences(pangenome, tmp_pang_file, add="ppanggolin_") + if target_type == "representative": + write_gene_fam_sequences(pangenome, tmp_pang_file, add="ppanggolin_") + elif target_type == "all": + write_all_gene_sequences(pangenome, tmp_pang_file, add="ppanggolin_") + else: + raise ValueError('Invalid value for target_type. It should be "all" or "representative".') with read_compressed_or_not(sequence_file) as seqFileObj: seq_set = get_seq(seqFileObj) - align_file = align_seq_to_pang(tmp_pang_file, seqFileObj, output, tmpdir, cpu, no_defrag, identity, coverage, is_nucleotide, translation_table ) - - seq2pang, align_file = associate_input_seq_to_gene_family_from_aln(align_file, output, pangenome) + align_file = align_seq_to_pang(tmp_pang_file, seqFileObj, output, tmpdir, cpu, + no_defrag, identity, coverage, is_nucleotide, translation_table ) + if target_type == "representative": + seq2pang, align_file = associate_input_seq_to_gene_family_from_aln_rep(align_file, output, pangenome) + else: + seq2pang, align_file = associate_input_seq_to_gene_family_from_aln_all(align_file, output, pangenome) return seq_set, align_file, seq2pang diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index d256f2de..cd956000 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -102,7 +102,7 @@ def launch(args: argparse.Namespace): # read_annotations(pangenome, args.anno, cpu=args.cpu, pseudo=args.use_pseudo, disable_bar=args.disable_prog_bar) input_organism, 
has_sequence = read_anno_file(organism_name=args.organism_name, filename=args.annot_file, - circular_contigs=[], + circular_contigs=[], pseudo=args.use_pseudo) if input_organism.number_of_genes() == 0: raise ValueError("The input organism lacks gene annotations. " @@ -138,7 +138,7 @@ def launch(args: argparse.Namespace): singleton_gene_count = annotate_input_genes_with_pangenome_families(pangenome, input_organism=input_organism, output=output_dir, cpu=args.cpu, no_defrag=args.no_defrag, identity=args.identity, coverage=args.coverage, tmpdir=args.tmpdir, - translation_table=args.translation_table) + translation_table=args.translation_table, keep_tmp=True) input_org_rgps, input_org_spots, input_org_modules = None, None, None @@ -241,7 +241,7 @@ def summarize_projection(input_organism:Organism, pangenome:Pangenome, input_org def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_organism: Organism, output: Path, cpu: int, no_defrag: bool, identity: float, coverage: float, tmpdir: Path, - translation_table: int): + translation_table: int, keep_tmp:bool = False): """ Annotate input genes with pangenome gene families and perform clustering. @@ -255,10 +255,13 @@ def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_org :param tmpdir: Temporary directory for intermediate files. :param disable_bar: Whether to disable progress bar. :param translation_table: Translation table ID for nucleotide sequences. + :param keep_tmp: If True, keep temporary files. :return: Number of genes that do not cluster with any of the gene families of the pangenome. """ + target_type = "all" + seq_fasta_file = output / f"{input_organism.name}.fasta" logging.info(f'The input organism has {input_organism.number_of_genes()} genes. 
Writting them in {seq_fasta_file}') @@ -266,15 +269,22 @@ def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_org with open(seq_fasta_file, "w") as fh_out_faa: write_gene_sequences_from_annotations( input_organism.genes, fh_out_faa, disable_bar=True) + + if keep_tmp: + dir_name = 'seq_to_pang_tmpdir' + time.strftime("_%Y-%m-%d_%H.%M.%S",time.localtime()) + "_PID" + str(os.getpid()) + new_tmpdir = tmpdir / dir_name + mk_outdir(new_tmpdir, force=True) - # create tmpdir in case it does not exists - mk_outdir(tmpdir, force=True) + else: + new_tmpdir = tempfile.TemporaryDirectory(dir=tmpdir, prefix="seq_to_pang_tmpdir_") + new_tmpdir = Path(new_tmpdir.name) - with tempfile.TemporaryDirectory(dir=tmpdir, prefix="seq_to_pang_tmpdir_") as new_tmpdir: + seq_set, _, seqid_to_gene_family = get_seq2pang(pangenome, seq_fasta_file, output, new_tmpdir, + cpu, no_defrag, identity=identity, coverage=coverage, + is_nucleotide=True, translation_table=translation_table, target_type="all") - seq_set, _, seqid_to_gene_family = get_seq2pang(pangenome, seq_fasta_file, output, Path(new_tmpdir), - cpu, no_defrag, identity=identity, coverage=coverage, - is_nucleotide=True, translation_table=translation_table) + if not keep_tmp: + new_tmpdir.cleanup() project_and_write_partition(seqid_to_gene_family, seq_set, output) @@ -766,3 +776,6 @@ def parser_projection(parser: argparse.ArgumentParser): optional.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus") + + optional.add_argument("--keep_tmp", required=False, default=False, action="store_true", + help="Keeping temporary files (useful for debugging).") From f183ca731182b416927440d197518508e04248b1 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Mon, 4 Sep 2023 18:34:28 +0200 Subject: [PATCH 085/173] improve summary yaml --- ppanggolin/projection/projection.py | 49 ++++++++++++++--------------- 1 file changed, 24 insertions(+), 25 deletions(-) diff --git 
a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index cd956000..a1521d9f 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -16,6 +16,7 @@ # installed libraries from tqdm import tqdm import networkx as nx +import yaml # # local libraries from ppanggolin.annotate.synta import annotate_organism, read_fasta, get_dna_sequence @@ -216,28 +217,26 @@ def summarize_projection(input_organism:Organism, pangenome:Pangenome, input_org new_spot_count = "Not computed" if input_org_spots is None else sum(1 for spot in input_org_spots if isinstance(spot, NewSpot)) module_count = "Not computed" if input_org_modules is None else len(input_org_modules) - summary_info = [ - ("Organism name", input_organism.name), - ("Pangenome file", pangenome.file), - ("Contigs", contigs_count), - ("Genes", gene_count), - ("Families", families_count), - ("Singleton families", singleton_gene_count), - ("Persistent families", persistent_family_count), - ("Persistent genes", persistent_gene_count), - ("Shell families", shell_family_count), - ("Shell genes", shell_gene_count), - ("Cloud families", cloud_family_count), - ("Cloud genes", cloud_gene_count), - ("RGPs", rgp_count), - ("Spots", spot_count), - ("New spots", new_spot_count), - ("Modules", module_count) - ] - - summary_str = '\n'.join((f' - {k}: {v}' for k,v in summary_info )) + summary_info = { + "Organism name": input_organism.name, + "Pangenome file": pangenome.file, + "Contigs": contigs_count, + "Genes": gene_count, + "Families": families_count, + "Persistent": {"genes":persistent_gene_count, "families":persistent_family_count}, + "Shell": {"genes":persistent_gene_count, "families":persistent_family_count}, + "Cloud": {"genes":persistent_gene_count, "families":persistent_family_count, "singleton families":singleton_gene_count}, + "RGPs": rgp_count, + "Spots": spot_count, + "New spots": new_spot_count, + "Modules": module_count + } + yaml_string = yaml.dump(summary_info, 
default_flow_style=False, sort_keys=False) + + + # summary_str = '\n'.join((f' - {k}: {v}' for k,v in summary_info )) print('Projection_summary:') - print(summary_str) + print(yaml_string) def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_organism: Organism, output: Path, cpu: int, no_defrag: bool, identity: float, coverage: float, tmpdir: Path, @@ -747,9 +746,6 @@ def parser_projection(parser: argparse.ArgumentParser): time.localtime()) + "_PID" + str(os.getpid()), help="Output directory") - optional.add_argument("--tmpdir", required=False, type=Path, default=Path(tempfile.gettempdir()), - help="directory for storing temporary files") - optional.add_argument('--no_defrag', required=False, action="store_true", help="DO NOT Realign gene families to link fragments with" "their non-fragmented gene family. (default: False)") @@ -776,6 +772,9 @@ def parser_projection(parser: argparse.ArgumentParser): optional.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus") - + + optional.add_argument("--tmpdir", required=False, type=Path, default=Path(tempfile.gettempdir()), + help="directory for storing temporary files") + optional.add_argument("--keep_tmp", required=False, default=False, action="store_true", help="Keeping temporary files (useful for debugging).") From f5b51d8c4614f4619eb16b0aee86ad5ff54d85aa Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 5 Sep 2023 13:36:19 +0200 Subject: [PATCH 086/173] make fast option and update tmpdir approach --- ppanggolin/align/alignOnPang.py | 204 ++++++++++++++---------- ppanggolin/context/searchGeneContext.py | 2 +- ppanggolin/projection/projection.py | 52 +++--- ppanggolin/utils.py | 20 +++ 4 files changed, 169 insertions(+), 109 deletions(-) diff --git a/ppanggolin/align/alignOnPang.py b/ppanggolin/align/alignOnPang.py index 0cd13fef..4d73e1bd 100644 --- a/ppanggolin/align/alignOnPang.py +++ b/ppanggolin/align/alignOnPang.py @@ -21,98 +21,92 @@ from 
ppanggolin.figures.draw_spot import draw_selected_spots, subgraph from ppanggolin.formats.readBinaries import get_gene_sequences_from_file -def createdb(file_obj: TextIOWrapper, tmpdir: Path, delete_tmp_file: bool = True) -> IO: +def create_mmseqs_db(seq_file: Path, tmpdir: Path, basename="sequences") -> Path: """ Create a MMseqs2 sequence database with the given fasta file - :param file_obj: Fasta file + :param seq_file: Fasta file :param tmpdir: temporary directory :return: DB file """ - seqdb = tempfile.NamedTemporaryFile(mode="w", dir=tmpdir, delete=False) - cmd = ["mmseqs", "createdb", file_obj.name, seqdb.name, '--dbtype', '0'] - logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) - subprocess.run(cmd, stdout=subprocess.DEVNULL) - return seqdb + with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir, delete=False, suffix=".DB", prefix=basename) as seqdb: + cmd = ["mmseqs", "createdb", seq_file.as_posix(), seqdb.name, '--dbtype', '0'] + + logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) + subprocess.run(cmd, stdout=subprocess.DEVNULL) + + return Path(seqdb.name) + +def translate_with_mmseqs(seqdb:Path, translation_table:int, cpu:int, tmpdir: Path) -> Path: + """ + """ + with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir, delete=False, prefix=seqdb.stem, suffix=".aa.DB") as seqdb_aa: + cmd = ["mmseqs", "translatenucs", seqdb.as_posix(), seqdb_aa.name, "--translation-table", + f"{translation_table}", "--threads", str(cpu)] + + logging.getLogger().debug(" ".join(cmd)) + subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) + + return Path(seqdb_aa.name) -def align_seq_to_pang(pang_file: IO, seq_file: TextIOWrapper, output: Path, + +def align_seq_to_pang(target_seq_file:Path , query_seq_file: Path, output: Path, tmpdir: Path, cpu: int = 1, no_defrag: bool = False, - identity: float = 0.8, coverage: float = 0.8, is_nucleotid:bool = False, translation_table: int = None) -> Path: + identity: float = 0.8, coverage: float = 0.8, + is_query_nt:bool = False, 
is_target_nt:bool = False, translation_table: int = None) -> Path: """ Align fasta sequence to pangenome sequences. - :param pang_file: File with sequences in pangenome - :param seq_file: File with sequences from input file + :param target_seq_file: File with sequences of pangenome (target) + :param query_seq_file: File with sequences from input file (query) :param output: Path of the output directory :param tmpdir: Temporary directory to align sequences :param cpu: Number of available cpu - :param no_defrag: Allow to pass the defragmentation step + :param no_defrag: Do not apply defragmentation :param identity: minimal identity threshold for the alignment :param coverage: minimal identity threshold for the alignment - :param is_nucleotid: Is the sequence file are nucleotide sequences. If True, sequences are translated by mmseqs + :param is_query_nt: Is the sequence file (query) are nucleotide sequences. If True, sequences are translated by mmseqs + :param is_target_nt: Is the sequences of pangenome (target) are nucleotide sequences. If True, sequences are translated by mmseqs :param translation_table: Translation table to use, if sequences are nucleotide and need to be translated. 
:return: Alignement result file """ - translate_first = True - pang_file_is_nt = True - - pang_db = createdb(pang_file, tmpdir) - seq_db = createdb(seq_file, tmpdir) - - if pang_file_is_nt: - logging.getLogger().debug(f"Pangenomes sequences will be translated by mmseqs with translation table {translation_table}") - - pangdb_aa = tempfile.NamedTemporaryFile(mode="w", dir=tmpdir, delete=False, prefix="pangenome_seq", suffix=".aa.DB") - cmd = ["mmseqs", "translatenucs", pang_db.name, pangdb_aa.name, "--translation-table", - f"{translation_table}", "--threads", str(cpu)] - logging.getLogger().debug(" ".join(cmd)) - subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) - pang_db = pangdb_aa + target_db = create_mmseqs_db(target_seq_file, tmpdir, basename="target_sequences") + query_db = create_mmseqs_db(query_seq_file, tmpdir, basename="query_sequences") - if translate_first and is_nucleotid: - logging.getLogger().debug(f"Input sequences will be translated by mmseqs with translation table {translation_table}") + if is_target_nt: + logging.getLogger().debug(f"Target sequences will be translated by mmseqs with translation table {translation_table}") + target_db = translate_with_mmseqs(target_db, translation_table, cpu, tmpdir) - seqdb_aa = tempfile.NamedTemporaryFile(mode="w", dir=tmpdir, delete=False, prefix="input_seq", suffix=".aa.DB") + if is_query_nt: + logging.getLogger().debug(f"Query sequences will be translated by mmseqs with translation table {translation_table}") + query_db = translate_with_mmseqs(query_db, translation_table, cpu, tmpdir) - cmd = ["mmseqs", "translatenucs", seq_db.name, seqdb_aa.name, "--translation-table", - f"{translation_table}", "--threads", str(cpu)] - - logging.getLogger().debug(" ".join(cmd)) - subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) - seq_db = seqdb_aa - - cov_mode = "1" # coverage of target + cov_mode = "2" # coverage of query if no_defrag: cov_mode = "0" # coverage of query and target - - with 
tempfile.NamedTemporaryFile(mode="w", dir=tmpdir.as_posix(), prefix="aln_result_db_file", delete=False) as aln_db: - cmd = ["mmseqs", "search", seq_db.name, pang_db.name, aln_db.name, tmpdir.as_posix(), "-a", "--min-seq-id", str(identity), - "-c", str(coverage), "--cov-mode", cov_mode, "--threads", str(cpu), "--max-seqs", str(1)] - if not translate_first and is_nucleotid: - logging.getLogger().debug(f"Input sequences will be translated by mmseqs search with translation table {translation_table}") - cmd += ["--translation-table", f"{translation_table}", "--translate", "0" ] + # mmseqs search command + + with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir.as_posix(), prefix="aln_result_db_file", suffix=".aln.DB", delete=False) as aln_db: + cmd = ["mmseqs", "search", query_db.as_posix(), target_db.as_posix(), aln_db.name, tmpdir.as_posix(), "-a", "--min-seq-id", str(identity), + "-c", str(coverage), "--cov-mode", cov_mode, "--threads", str(cpu)] #, "--max-accept", str(1)] - logging.getLogger().info("Aligning sequences")# to cluster representatives...") + logging.getLogger().info("Aligning sequences") logging.getLogger().debug(" ".join(cmd)) subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir, prefix="aln_result_db_file", suffix = ".tsv", delete=False) as outfile: - cmd = ["mmseqs", "convertalis", seq_db.name, pang_db.name, aln_db.name, outfile.name, "--format-mode", "2"] + cmd = ["mmseqs", "convertalis", query_db.as_posix(), target_db.as_posix(), aln_db.name, outfile.name, "--format-mode", "2"] logging.getLogger().info("Extracting alignments...") logging.getLogger().debug(" ".join(cmd)) subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) - - pang_db.close() - seq_db.close() - - return outfile.name + return Path(outfile.name) def associate_input_seq_to_gene_family_from_aln_all(aln_res: Path, outdir:Path, pangenome: Pangenome) -> Tuple[Dict[str, GeneFamily], str]: @@ -128,7 +122,7 @@ def 
associate_input_seq_to_gene_family_from_aln_all(aln_res: Path, outdir:Path, """ seq2pang = {} - result_file = outdir / f"alignment_input_seqs_to_pangenome_gene_families.tsv" # write the actual result file + result_file = outdir / f"alignment_input_seqs_to_all_pangenome_genes.tsv" # write the actual result file logging.getLogger(f'Get write alignment file in {result_file}') with open(aln_res, "r") as alnFile, open(result_file, "w") as outfile : @@ -136,11 +130,12 @@ def associate_input_seq_to_gene_family_from_aln_all(aln_res: Path, outdir:Path, line_splitted = line.split() line_splitted[1] = line_splitted[1].replace("ppanggolin_", "") # remove the 'ppanggolin_' bit of the id - - outfile.write("\t".join(line_splitted) + "\n") + line_splitted[0] = line_splitted[0].replace("ppanggolin_", "") input_seq_id, gene_id = line_splitted[0:2] + outfile.write("\t".join(line_splitted) + "\n") + if seq2pang.get(input_seq_id) is None: # if no results were found yet seq2pang[input_seq_id] = pangenome.get_gene(gene_id).family # then the best hit is the first one we see. 
@@ -160,13 +155,14 @@ def associate_input_seq_to_gene_family_from_aln_rep(aln_res: Path, outdir:Path, """ seq2pang = {} result_file = outdir / f"alignment_input_seqs_to_pangenome_gene_families.tsv" # write the actual result file - logging.getLogger(f'Get write alignment file in {result_file}') + logging.getLogger().debug(f'Write alignment file in {result_file}') with open(aln_res, "r") as alnFile, open(result_file, "w") as outfile : for line in alnFile: line_splitted = line.split() line_splitted[1] = line_splitted[1].replace("ppanggolin_", "") # remove the 'ppanggolin_' bit of the id + line_splitted[0] = line_splitted[0].replace("ppanggolin_", "") outfile.write("\t".join(line_splitted) + "\n") @@ -178,7 +174,7 @@ def associate_input_seq_to_gene_family_from_aln_rep(aln_res: Path, outdir:Path, return seq2pang, outfile -def get_seq(seq_file: TextIOWrapper) -> Set[str]: +def get_seq_ids(seq_file: TextIOWrapper) -> Set[str]: """ get sequence if from sequence input file in fasta format @@ -215,13 +211,9 @@ def write_all_gene_sequences(pangenome: Pangenome, file_obj: IO, add: str = "", :param add: Add prefix to sequence name """ gene_ids_to_write = {gene.ID for fam in pangenome.gene_families for gene in fam.genes} - get_gene_sequences_from_file(pangenome.file, file_obj, gene_ids_to_write, + # TODO Check that the sequence are in file or loaded and launch appropriate fct accordingly + get_gene_sequences_from_file(pangenome.file, file_obj, gene_ids_to_write, add=add, disable_bar=disable_bar) - - # for gene in pangenome.genes: - # file_obj.write(">" + add + gene.ID + "\n") - # file_obj.write(gene.protein + "\n") - # file_obj.flush() def project_and_write_partition(seqid_to_gene_family: Dict[str, GeneFamily], seq_set: Set[str], output: Path) -> Path: """ @@ -365,11 +357,10 @@ def get_seq_info(seq_to_pang: dict, pangenome: Pangenome, output: Path, draw_rel logging.getLogger("PPanGGOLiN").info(f"File listing RGP and spots where sequences of interest are located : " f"{output / 
'info_input_seq.tsv'}") - -def get_seq2pang(pangenome: Pangenome, sequence_file: Path, output: Path, tmpdir: Path, +def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_file: Path, output: Path, tmpdir: Path, cpu: int = 1, no_defrag: bool = False, identity: float = 0.8, - coverage: float = 0.8, is_nucleotide: bool = False, translation_table: int = 11, - target_type: str = "representative") -> Tuple[set, str, dict]: + coverage: float = 0.8, is_nucleotide: bool = False, translation_table: int = 11) -> Tuple[set, str, dict]: + """ Assign gene families from a pangenome to input sequences. @@ -387,31 +378,72 @@ def get_seq2pang(pangenome: Pangenome, sequence_file: Path, output: Path, tmpdir :param is_nucleotide: Set to True if the sequence file contains nucleotide sequences to be translated. If True, sequences will be translated using the specified translation table (default: False). :param translation_table: Translation table to use if sequences need to be translated (default: 11). - :param target_type: Type of target sequences to align input sequences with, either 'all' or 'representative' - (default: 'representative'). - + :return: A tuple containing the set of input sequences, the path to the alignment result file, and a dictionary mapping input sequences to gene families. - :raises ValueError: If the `target_type` is not 'all' or 'representative'. """ + # delete False to be able to keep tmp file. 
If they are not keep tmpdir will be destroyed so no need to delete tmpfile + with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir.as_posix(), delete=False, + prefix="representative_genes", suffix=".faa") as tmp_pang_file: + + write_gene_fam_sequences(pangenome, tmp_pang_file, add="ppanggolin_") + + with read_compressed_or_not(sequence_file) as seqFileObj: + seqids_set = get_seq_ids(seqFileObj) + + align_file = align_seq_to_pang(target_seq_file=Path(tmp_pang_file.name), query_seq_file=sequence_file, output=output, + tmpdir=tmpdir, cpu=cpu, + no_defrag=no_defrag, identity=identity, coverage=coverage, + is_query_nt=is_nucleotide, is_target_nt=False, + translation_table=translation_table) + + seq2pang, align_file = associate_input_seq_to_gene_family_from_aln_rep(align_file, output, pangenome) + + return seqids_set, align_file, seq2pang + + +def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_file: Path, output: Path, tmpdir: Path, + cpu: int = 1, no_defrag: bool = False, identity: float = 0.8, coverage: float = 0.8, + is_nucleotide: bool = False, translation_table: int = 11,) -> Tuple[set, str, dict]: + """ + Assign gene families from a pangenome to input sequences. + + This function aligns input sequences to all genes of the pangenome using MMseqs2 and assigns them + to a gene families based on alignment results. + + :param pangenome: Annotated pangenome containing genes. + :param sequence_file: Path to a FASTA file containing input sequences to align. + :param output: Path to the output directory where alignment results will be stored. + :param tmpdir: Temporary directory for intermediate files. + :param cpu: Number of CPU cores to use for the alignment (default: 1). + :param no_defrag: If True, the defragmentation workflow is skipped (default: False). + :param identity: Minimum identity threshold for the alignment (default: 0.8). + :param coverage: Minimum coverage threshold for the alignment (default: 0.8). 
+ :param is_nucleotide: Set to True if the sequence file contains nucleotide sequences to be translated. + If True, sequences will be translated using the specified translation table (default: False). + :param translation_table: Translation table to use if sequences need to be translated (default: 11). + + :return: A tuple containing the set of input sequences, the path to the alignment result file, + and a dictionary mapping input sequences to gene families. + """ - with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir.as_posix(), delete=False, suffix=".faa") as tmp_pang_file: - if target_type == "representative": - write_gene_fam_sequences(pangenome, tmp_pang_file, add="ppanggolin_") - elif target_type == "all": - write_all_gene_sequences(pangenome, tmp_pang_file, add="ppanggolin_") - else: - raise ValueError('Invalid value for target_type. It should be "all" or "representative".') + + with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir.as_posix(), delete=False, + prefix="all_pangenome_genes", suffix=".fna") as tmp_pang_file: + + write_all_gene_sequences(pangenome, tmp_pang_file, add="ppanggolin_") with read_compressed_or_not(sequence_file) as seqFileObj: - seq_set = get_seq(seqFileObj) - align_file = align_seq_to_pang(tmp_pang_file, seqFileObj, output, tmpdir, cpu, - no_defrag, identity, coverage, is_nucleotide, translation_table ) - if target_type == "representative": - seq2pang, align_file = associate_input_seq_to_gene_family_from_aln_rep(align_file, output, pangenome) - else: - seq2pang, align_file = associate_input_seq_to_gene_family_from_aln_all(align_file, output, pangenome) + seq_set = get_seq_ids(seqFileObj) + + align_file = align_seq_to_pang(target_seq_file=Path(tmp_pang_file.name), query_seq_file=sequence_file, output=output, + tmpdir=tmpdir, cpu=cpu, + no_defrag=no_defrag, identity=identity, coverage=coverage, + is_query_nt=is_nucleotide, is_target_nt=True, + translation_table=translation_table ) + + seq2pang, align_file = 
associate_input_seq_to_gene_family_from_aln_all(align_file, output, pangenome) return seq_set, align_file, seq2pang diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index cb869de9..2148e540 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -18,7 +18,7 @@ from ppanggolin.genome import Gene, Contig from ppanggolin.utils import mk_outdir, restricted_float, add_gene, connected_components from ppanggolin.pangenome import Pangenome -from ppanggolin.align.alignOnPang import get_seq2pang, project_and_write_partition +from ppanggolin.align.alignOnPang import project_and_write_partition #get_seq2pang, from ppanggolin.region import GeneContext diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index a1521d9f..cea057cf 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -13,6 +13,7 @@ from collections import defaultdict import csv + # installed libraries from tqdm import tqdm import networkx as nx @@ -24,8 +25,8 @@ from ppanggolin.annotate import subparser as annotate_subparser from ppanggolin.pangenome import Pangenome # from ppanggolin.genome import input_organism, Gene, RNA, Contig -from ppanggolin.utils import read_compressed_or_not, write_compressed_or_not, restricted_float, mk_outdir, get_config_args, parse_config_file, get_default_args -from ppanggolin.align.alignOnPang import get_seq2pang, project_and_write_partition +from ppanggolin.utils import create_tmpdir, read_compressed_or_not, write_compressed_or_not, restricted_float, mk_outdir, get_config_args, parse_config_file, get_default_args +from ppanggolin.align.alignOnPang import get_input_seq_to_family_with_rep,get_input_seq_to_family_with_all, project_and_write_partition from ppanggolin.formats.writeSequences import write_gene_sequences_from_annotations from ppanggolin.formats.readBinaries import check_pangenome_info # from ppanggolin.formats 
import write_pangenome @@ -137,9 +138,11 @@ def launch(args: argparse.Namespace): # Add input organism in pangenome. This is temporary as the pangenome object is not going to be written. pangenome.add_organism(input_organism) - singleton_gene_count = annotate_input_genes_with_pangenome_families(pangenome, input_organism=input_organism, output=output_dir, cpu=args.cpu, - no_defrag=args.no_defrag, identity=args.identity, coverage=args.coverage, tmpdir=args.tmpdir, - translation_table=args.translation_table, keep_tmp=True) + singleton_gene_count = annotate_input_genes_with_pangenome_families(pangenome, input_organism=input_organism, + output=output_dir, cpu=args.cpu,use_representatives=args.fast, + no_defrag=args.no_defrag, identity=args.identity, + coverage=args.coverage, tmpdir=args.tmpdir, + translation_table=args.translation_table, keep_tmp=args.keep_tmp) input_org_rgps, input_org_spots, input_org_modules = None, None, None @@ -238,7 +241,9 @@ def summarize_projection(input_organism:Organism, pangenome:Pangenome, input_org print('Projection_summary:') print(yaml_string) -def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_organism: Organism, output: Path, cpu: int, no_defrag: bool, + +def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_organism: Organism, output: Path, + cpu: int,use_representatives:bool, no_defrag: bool, identity: float, coverage: float, tmpdir: Path, translation_table: int, keep_tmp:bool = False): """ @@ -249,6 +254,7 @@ def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_org :param output: Output directory for generated files. :param cpu: Number of CPU cores to use. :param no_defrag: Whether to use defragmentation. + :param use_representatives: Use representative sequences of gene families rather than all sequence to align input genes :param identity: Minimum identity threshold for gene clustering. :param coverage: Minimum coverage threshold for gene clustering. 
:param tmpdir: Temporary directory for intermediate files. @@ -259,31 +265,29 @@ def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_org :return: Number of genes that do not cluster with any of the gene families of the pangenome. """ - target_type = "all" - seq_fasta_file = output / f"{input_organism.name}.fasta" logging.info(f'The input organism has {input_organism.number_of_genes()} genes. Writting them in {seq_fasta_file}') with open(seq_fasta_file, "w") as fh_out_faa: write_gene_sequences_from_annotations( - input_organism.genes, fh_out_faa, disable_bar=True) + input_organism.genes, fh_out_faa, disable_bar=True, add="ppanggolin_") - if keep_tmp: - dir_name = 'seq_to_pang_tmpdir' + time.strftime("_%Y-%m-%d_%H.%M.%S",time.localtime()) + "_PID" + str(os.getpid()) - new_tmpdir = tmpdir / dir_name - mk_outdir(new_tmpdir, force=True) - - else: - new_tmpdir = tempfile.TemporaryDirectory(dir=tmpdir, prefix="seq_to_pang_tmpdir_") - new_tmpdir = Path(new_tmpdir.name) - seq_set, _, seqid_to_gene_family = get_seq2pang(pangenome, seq_fasta_file, output, new_tmpdir, - cpu, no_defrag, identity=identity, coverage=coverage, - is_nucleotide=True, translation_table=translation_table, target_type="all") - if not keep_tmp: - new_tmpdir.cleanup() + with create_tmpdir(main_dir=tmpdir, basename="align_input_seq_tmpdir", keep_tmp=keep_tmp) as new_tmpdir: + + + if use_representatives: + seq_set, _, seqid_to_gene_family = get_input_seq_to_family_with_rep(pangenome, seq_fasta_file, output, new_tmpdir, + cpu, no_defrag, identity=identity, coverage=coverage, + is_nucleotide=True, translation_table=translation_table) + else: + seq_set, _, seqid_to_gene_family = get_input_seq_to_family_with_all(pangenome=pangenome, sequence_file=seq_fasta_file, + output=output, tmpdir=new_tmpdir, + cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, + is_nucleotide=True, translation_table=translation_table) + project_and_write_partition(seqid_to_gene_family, seq_set, 
output) @@ -749,6 +753,10 @@ def parser_projection(parser: argparse.ArgumentParser): optional.add_argument('--no_defrag', required=False, action="store_true", help="DO NOT Realign gene families to link fragments with" "their non-fragmented gene family. (default: False)") + + optional.add_argument("--fast", required=False, action="store_true", + help="Use representative sequences of gene families for input gene alignment. " + "This option is faster but may be less sensitive. By default, all pangenome genes are used.") optional.add_argument('--identity', required=False, type=restricted_float, default=0.8, help="min identity percentage threshold") diff --git a/ppanggolin/utils.py b/ppanggolin/utils.py index 5907961f..fa9b3f40 100755 --- a/ppanggolin/utils.py +++ b/ppanggolin/utils.py @@ -10,6 +10,9 @@ from io import TextIOWrapper from pathlib import Path from typing import TextIO, Union, BinaryIO, Tuple, List, Set, Iterable +from contextlib import contextmanager +import tempfile +import time import networkx as nx import pkg_resources @@ -261,6 +264,23 @@ def mk_outdir(output: Path, force: bool = False): raise FileExistsError( f"{output} already exists. Use -f if you want to overwrite the files in the directory") +@contextmanager +def create_tmpdir(main_dir, basename="tmpdir", keep_tmp=False): + + if keep_tmp: + dir_name = basename + time.strftime("_%Y-%m-%d_%H.%M.%S",time.localtime()) + + new_tmpdir = main_dir / dir_name + logging.debug(f'Creating a temporary directory: {new_tmpdir.as_posix()}. This directory will be retained.') + + mk_outdir(new_tmpdir, force=True) + yield new_tmpdir + + else: + with tempfile.TemporaryDirectory(dir=main_dir, prefix=basename) as new_tmpdir: + logging.debug(f"Creating a temporary directory: {new_tmpdir}. This directory won't be retained.") + yield Path(new_tmpdir) + def mk_file_name(basename: str, output: Path, force: bool = False) -> Path: """Returns a usable filename for a ppanggolin output file, or crashes. 
From 1922048df9c1ec1cc6659301a30425162fbddd2e Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 5 Sep 2023 14:03:51 +0200 Subject: [PATCH 087/173] add docstring --- ppanggolin/align/alignOnPang.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/ppanggolin/align/alignOnPang.py b/ppanggolin/align/alignOnPang.py index 4d73e1bd..3cf4c988 100644 --- a/ppanggolin/align/alignOnPang.py +++ b/ppanggolin/align/alignOnPang.py @@ -23,13 +23,15 @@ def create_mmseqs_db(seq_file: Path, tmpdir: Path, basename="sequences") -> Path: """ - Create a MMseqs2 sequence database with the given fasta file + Create a MMseqs2 sequence database with the given fasta file. - :param seq_file: Fasta file - :param tmpdir: temporary directory + :param seq_file: Path to the input FASTA file. + :param tmpdir: Path to the temporary directory where the database will be created. + :param basename: Prefix for the database file (default: "sequences"). - :return: DB file + :return: Path to the created MMseqs2 database file. """ + with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir, delete=False, suffix=".DB", prefix=basename) as seqdb: cmd = ["mmseqs", "createdb", seq_file.as_posix(), seqdb.name, '--dbtype', '0'] @@ -38,9 +40,18 @@ def create_mmseqs_db(seq_file: Path, tmpdir: Path, basename="sequences") -> Path return Path(seqdb.name) -def translate_with_mmseqs(seqdb:Path, translation_table:int, cpu:int, tmpdir: Path) -> Path: +def translate_with_mmseqs(seqdb: Path, translation_table: int, cpu: int, tmpdir: Path) -> Path: """ + Translate nucleotide sequences in an MMseqs2 sequence database to amino acid sequences. + + :param seqdb: Path to the input MMseqs2 sequence database containing nucleotide sequences. + :param translation_table: The translation table to use for conversion. + :param cpu: Number of CPU cores to use for translation. + :param tmpdir: Path to the temporary directory for intermediate files. 
+ + :return: Path to the new MMseqs2 sequence database containing translated amino acid sequences. """ + with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir, delete=False, prefix=seqdb.stem, suffix=".aa.DB") as seqdb_aa: cmd = ["mmseqs", "translatenucs", seqdb.as_posix(), seqdb_aa.name, "--translation-table", From 61f26fa7957513ded62b3d80ce3abfbea073c630 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 5 Sep 2023 15:36:04 +0200 Subject: [PATCH 088/173] adapt align to fast option and guess seqtype --- ppanggolin/align/alignOnPang.py | 89 +++++++++++++++++------------ ppanggolin/projection/projection.py | 5 +- 2 files changed, 53 insertions(+), 41 deletions(-) diff --git a/ppanggolin/align/alignOnPang.py b/ppanggolin/align/alignOnPang.py index 3cf4c988..0487b64b 100644 --- a/ppanggolin/align/alignOnPang.py +++ b/ppanggolin/align/alignOnPang.py @@ -7,15 +7,14 @@ import tempfile import subprocess import argparse -from collections import defaultdict +from collections import defaultdict, Counter from typing import List, Tuple, Set, Dict, IO, Iterator from pathlib import Path -import time # local libraries from ppanggolin.formats import check_pangenome_info from ppanggolin.geneFamily import GeneFamily -from ppanggolin.utils import mk_outdir, read_compressed_or_not +from ppanggolin.utils import mk_outdir, read_compressed_or_not, create_tmpdir from ppanggolin.pangenome import Pangenome from ppanggolin.region import Spot from ppanggolin.figures.draw_spot import draw_selected_spots, subgraph @@ -51,7 +50,7 @@ def translate_with_mmseqs(seqdb: Path, translation_table: int, cpu: int, tmpdir: :return: Path to the new MMseqs2 sequence database containing translated amino acid sequences. 
""" - + with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir, delete=False, prefix=seqdb.stem, suffix=".aa.DB") as seqdb_aa: cmd = ["mmseqs", "translatenucs", seqdb.as_posix(), seqdb_aa.name, "--translation-table", @@ -185,19 +184,32 @@ def associate_input_seq_to_gene_family_from_aln_rep(aln_res: Path, outdir:Path, return seq2pang, outfile -def get_seq_ids(seq_file: TextIOWrapper) -> Set[str]: + +def get_seq_ids(seq_file: TextIOWrapper) -> Tuple[Set[str], bool]: """ - get sequence if from sequence input file in fasta format + Get sequence IDs from a sequence input file in FASTA format and guess the sequence type based on the first sequences. - :param seq_file: file containing sequences + :param seq_file: A file object containing sequences in FASTA format. - :return: set of sequences + :return: A tuple containing a set of sequence IDs and a boolean indicating if the sequences are nucleotide sequences. """ - seqset = set() + dna_expected_char = {'A', 'T', 'G', 'C', 'N'} + seq_set = set() + seq_count = 0 + first_seq_concat = "" + for line in seq_file: if line.startswith(">"): - seqset.add(line[1:].split()[0].strip()) - return seqset + seq_set.add(line[1:].split()[0].strip()) + seq_count += 1 + elif seq_count <= 20: + first_seq_concat += line.strip() + + char_counter = Counter(first_seq_concat) + is_nucleotide = all(char in dna_expected_char for char in char_counter) + + return seq_set, is_nucleotide + def write_gene_fam_sequences(pangenome: Pangenome, file_obj: IO, add: str = ""): @@ -386,8 +398,6 @@ def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_file: Path, :param no_defrag: If True, the defragmentation workflow is skipped (default: False). :param identity: Minimum identity threshold for the alignment (default: 0.8). :param coverage: Minimum coverage threshold for the alignment (default: 0.8). - :param is_nucleotide: Set to True if the sequence file contains nucleotide sequences to be translated. 
- If True, sequences will be translated using the specified translation table (default: False). :param translation_table: Translation table to use if sequences need to be translated (default: 11). :return: A tuple containing the set of input sequences, the path to the alignment result file, @@ -401,7 +411,7 @@ def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_file: Path, write_gene_fam_sequences(pangenome, tmp_pang_file, add="ppanggolin_") with read_compressed_or_not(sequence_file) as seqFileObj: - seqids_set = get_seq_ids(seqFileObj) + seqids_set, is_nucleotide = get_seq_ids(seqFileObj) align_file = align_seq_to_pang(target_seq_file=Path(tmp_pang_file.name), query_seq_file=sequence_file, output=output, tmpdir=tmpdir, cpu=cpu, @@ -431,8 +441,6 @@ def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_file: Path, :param no_defrag: If True, the defragmentation workflow is skipped (default: False). :param identity: Minimum identity threshold for the alignment (default: 0.8). :param coverage: Minimum coverage threshold for the alignment (default: 0.8). - :param is_nucleotide: Set to True if the sequence file contains nucleotide sequences to be translated. - If True, sequences will be translated using the specified translation table (default: False). :param translation_table: Translation table to use if sequences need to be translated (default: 11). 
:return: A tuple containing the set of input sequences, the path to the alignment result file, @@ -446,7 +454,7 @@ def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_file: Path, write_all_gene_sequences(pangenome, tmp_pang_file, add="ppanggolin_") with read_compressed_or_not(sequence_file) as seqFileObj: - seq_set = get_seq_ids(seqFileObj) + seq_set, is_nucleotide = get_seq_ids(seqFileObj) align_file = align_seq_to_pang(target_seq_file=Path(tmp_pang_file.name), query_seq_file=sequence_file, output=output, tmpdir=tmpdir, cpu=cpu, @@ -461,11 +469,13 @@ def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_file: Path, def align(pangenome: Pangenome, sequence_file: Path, output: Path, identity: float = 0.8, coverage: float = 0.8, no_defrag: bool = False, cpu: int = 1, getinfo: bool = False, - draw_related: bool = False, tmpdir: Path = None, disable_bar: bool = False, keep_tmp=False): + use_representatives: bool = False, + draw_related: bool = False, translation_table:int=11, tmpdir: Path = None, + disable_bar: bool = False, keep_tmp=False): """ Aligns pangenome sequences with sequences in a FASTA file using MMSeqs2. - :param pangenome: Pangenome containing gene families to align with the input sequences. + :param pangenome: Pangenome object containing gene families to align with the input sequences. :param sequence_file: Path to a FASTA file containing sequences to align with the pangenome. :param output: Path to the output directory. :param identity: Minimum identity threshold for the alignment. @@ -473,7 +483,9 @@ def align(pangenome: Pangenome, sequence_file: Path, output: Path, identity: flo :param no_defrag: If True, the defrag workflow will not be used. :param cpu: Number of CPU cores to use. :param getinfo: If True, extract information related to the best hit of each query, such as the RGP it is in or the spots. 
+ :param use_representatives: If True, use representative sequences of gene families rather than all sequences to align input genes. :param draw_related: If True, draw figures and graphs in a gexf format of spots associated with the input sequences. + :param translation_table: Translation table ID for nucleotide sequences. :param tmpdir: Temporary directory for intermediate files. :param disable_bar: If True, disable the progress bar. :param keep_tmp: If True, keep temporary files. @@ -497,21 +509,18 @@ def align(pangenome: Pangenome, sequence_file: Path, output: Path, identity: flo else: check_pangenome_info(pangenome, need_families=True, disable_bar=disable_bar) - if keep_tmp: - - dir_name = 'align_tmpdir' + time.strftime("_%Y-%m-%d_%H.%M.%S",time.localtime()) - tmp_path = Path(tmpdir) / dir_name - mk_outdir(tmp_path, force=True) - logging.getLogger().info(f'Temporary files will be written {tmp_path} and kept for reference.') - - else: - # if keep tmp is false, TemporaryDirectory created and then removed - new_tmpdir = tempfile.TemporaryDirectory(dir=tmpdir) - tmp_path = Path(new_tmpdir.name) - print(tmp_path) + with create_tmpdir(main_dir=tmpdir, basename="align_input_seq_tmpdir", keep_tmp=keep_tmp) as new_tmpdir: + - seq_set, align_file, seq2pang = get_seq2pang(pangenome, sequence_file, output, tmp_path, cpu, no_defrag, identity, - coverage) + if use_representatives: + seq_set, align_file, seq2pang = get_input_seq_to_family_with_rep(pangenome, sequence_file, output, new_tmpdir, + cpu, no_defrag, identity=identity, coverage=coverage, + translation_table=translation_table) + else: + seq_set, align_file, seq2pang = get_input_seq_to_family_with_all(pangenome=pangenome, sequence_file=sequence_file, + output=output, tmpdir=new_tmpdir, + cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, + is_nucleotide=True, translation_table=translation_table) if getinfo or draw_related: # TODO Add getinfo to function and remove if get_seq_info(seq2pang, pangenome, 
output, draw_related, disable_bar=disable_bar) @@ -521,8 +530,6 @@ def align(pangenome: Pangenome, sequence_file: Path, output: Path, identity: flo logging.getLogger().info(f"{len(seq2pang)} sequences over {len(seq_set)} have at least one hit in the pangenome.") logging.getLogger().info(f"Blast-tab file of the alignment : '{align_file.name}'") - # new_tmpdir.cleanup() - def launch(args: argparse.Namespace): """ @@ -533,9 +540,12 @@ def launch(args: argparse.Namespace): mk_outdir(args.output, args.force) pangenome = Pangenome() pangenome.add_file(args.pangenome) - align(pangenome=pangenome, sequence_file=args.sequences, output=args.output, tmpdir=args.tmpdir, - identity=args.identity, coverage=args.coverage, no_defrag=args.no_defrag, cpu=args.cpu, getinfo=args.getinfo, - draw_related=args.draw_related, disable_bar=args.disable_prog_bar, keep_tmp=args.keep_tmp) + align(pangenome=pangenome, sequence_file=args.sequences, output=args.output, + tmpdir=args.tmpdir, identity=args.identity, coverage=args.coverage, + no_defrag=args.no_defrag, cpu=args.cpu, getinfo=args.getinfo, + use_representatives=args.fast, draw_related=args.draw_related, + translation_table=args.translation_table, + disable_bar=args.disable_prog_bar, keep_tmp=args.keep_tmp) def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: @@ -574,6 +584,9 @@ def parser_align(parser: argparse.ArgumentParser): help="min identity percentage threshold") optional.add_argument('--coverage', required=False, type=float, default=0.8, help="min coverage percentage threshold") + optional.add_argument("--fast", required=False, action="store_true", + help="Use representative sequences of gene families for input gene alignment. " + "This option is faster but may be less sensitive. 
By default, all pangenome genes are used.") optional.add_argument("--translation_table", required=False, default="11", help="Translation table (genetic code) to use.") optional.add_argument("--getinfo", required=False, action="store_true", diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index cea057cf..87fab1d7 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -280,13 +280,12 @@ def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_org if use_representatives: seq_set, _, seqid_to_gene_family = get_input_seq_to_family_with_rep(pangenome, seq_fasta_file, output, new_tmpdir, - cpu, no_defrag, identity=identity, coverage=coverage, - is_nucleotide=True, translation_table=translation_table) + cpu, no_defrag, identity=identity, coverage=coverage, translation_table=translation_table) else: seq_set, _, seqid_to_gene_family = get_input_seq_to_family_with_all(pangenome=pangenome, sequence_file=seq_fasta_file, output=output, tmpdir=new_tmpdir, cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, - is_nucleotide=True, translation_table=translation_table) + translation_table=translation_table) project_and_write_partition(seqid_to_gene_family, seq_set, output) From 142e41d48ae051df15b299c6e1d3505de65384cf Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 5 Sep 2023 17:33:59 +0200 Subject: [PATCH 089/173] adapt context to fast option --- ppanggolin/context/searchGeneContext.py | 56 ++++++++++++++++++------- 1 file changed, 40 insertions(+), 16 deletions(-) diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index 2148e540..f5161f99 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -16,48 +16,61 @@ # local libraries from ppanggolin.formats import check_pangenome_info from ppanggolin.genome import Gene, Contig -from ppanggolin.utils import mk_outdir, 
restricted_float, add_gene, connected_components +from ppanggolin.utils import mk_outdir, restricted_float, add_gene, connected_components, create_tmpdir from ppanggolin.pangenome import Pangenome -from ppanggolin.align.alignOnPang import project_and_write_partition #get_seq2pang, +from ppanggolin.align.alignOnPang import project_and_write_partition, get_input_seq_to_family_with_rep, get_input_seq_to_family_with_all from ppanggolin.region import GeneContext -def search_gene_context_in_pangenome(pangenome: Pangenome, output: Path, tmpdir: Path, sequences: Path = None, +def search_gene_context_in_pangenome(pangenome: Pangenome, output: Path, tmpdir: Path, sequence_file: Path = None, families: Path = None, transitive: int = 4, identity: float = 0.5, - coverage: float = 0.8, jaccard: float = 0.85, no_defrag: bool = False, - cpu: int = 1, disable_bar=True): + coverage: float = 0.8, use_representatives: bool = False, jaccard: float = 0.85, no_defrag: bool = False, + cpu: int = 1, disable_bar=True, translation_table:int=11, keep_tmp:bool = False): """ Main function to search common gene contexts between sequence set and pangenome families :param pangenome: Pangenome containing GeneFamilies to align with sequence set - :param sequences: Path to file containing the sequences + :param sequence_file: Path to file containing the sequences :param families: Path to file containing families name :param output: Path to output directory :param tmpdir: Path to temporary directory :param transitive: number of genes to check on both sides of a family aligned with an input sequence :param identity: minimum identity threshold between sequences and gene families for the alignment :param coverage: minimum coverage threshold between sequences and gene families for the alignment + :param use_representatives: Use representative sequences of gene families rather than all sequences to align input genes :param jaccard: Jaccard index to filter edges in graph :param no_defrag: do not use the 
defrag workflow if true :param cpu: Number of core used to process :param disable_bar: Allow preventing bar progress print - """ + :param translation_table: Translation table ID for nucleotide sequences. + :param keep_tmp: If True, keep temporary files. + """ # check statuses and load info - if sequences is not None and pangenome.status["geneFamilySequences"] not in ["inFile", "Loaded", "Computed"]: + if sequence_file is not None and pangenome.status["geneFamilySequences"] not in ["inFile", "Loaded", "Computed"]: raise Exception("Cannot use this function as your pangenome does not have gene families representatives " "associated to it. For now this works only if the clustering is realised by PPanGGOLiN.") check_pangenome_info(pangenome, need_annotations=True, need_families=True, disable_bar=disable_bar) gene_families = {} fam_2_seq = None - if sequences is not None: + if sequence_file is not None: # Alignment of sequences on pangenome families - new_tmpdir = tempfile.TemporaryDirectory(dir=tmpdir) - seq_set, _, seq2pan = get_seq2pang(pangenome, sequences, output, Path(new_tmpdir.name), cpu, no_defrag, identity, - coverage) + with create_tmpdir(main_dir=tmpdir, basename="align_input_seq_tmpdir", keep_tmp=keep_tmp) as new_tmpdir: + + if use_representatives: + seq_set, _, seq2pan = get_input_seq_to_family_with_rep(pangenome, sequence_file, output, new_tmpdir, + cpu, no_defrag, identity=identity, coverage=coverage, + translation_table=translation_table) + else: + seq_set, _, seq2pan = get_input_seq_to_family_with_all(pangenome=pangenome, sequence_file=sequence_file, + output=output, tmpdir=new_tmpdir, + cpu=cpu, no_defrag=no_defrag, + identity=identity, coverage=coverage, + translation_table=translation_table) + project_and_write_partition(seq2pan, seq_set, output) - new_tmpdir.cleanup() + for k, v in seq2pan.items(): gene_families[v.name] = v fam_2_seq = fam2seq(seq2pan) @@ -243,9 +256,11 @@ def launch(args: argparse.Namespace): pangenome = Pangenome() 
pangenome.add_file(args.pangenome) search_gene_context_in_pangenome(pangenome=pangenome, output=args.output, tmpdir=args.tmpdir, - sequences=args.sequences, families=args.family, transitive=args.transitive, - identity=args.identity, coverage=args.coverage, jaccard=args.jaccard, - no_defrag=args.no_defrag, cpu=args.cpu, disable_bar=args.disable_prog_bar) + sequence_file=args.sequences, families=args.family, transitive=args.transitive, + identity=args.identity, coverage=args.coverage, + use_representatives=args.fast, jaccard=args.jaccard, + no_defrag=args.no_defrag, cpu=args.cpu, disable_bar=args.disable_prog_bar, + translation_table=args.translation_table, keep_tmp=args.keep_tmp) def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: @@ -284,10 +299,17 @@ def parser_context(parser: argparse.ArgumentParser): optional.add_argument('--no_defrag', required=False, action="store_true", help="DO NOT Realign gene families to link fragments with" "their non-fragmented gene family.") + optional.add_argument("--fast", required=False, action="store_true", + help="Use representative sequences of gene families for input gene alignment. " + "This option is recommended for faster processing but may be less sensitive. " + "By default, all pangenome genes are used for alignment. " + "This argument makes sense only when --sequence is provided.") optional.add_argument('--identity', required=False, type=float, default=0.5, help="min identity percentage threshold") optional.add_argument('--coverage', required=False, type=float, default=0.8, help="min coverage percentage threshold") + optional.add_argument("--translation_table", required=False, default="11", + help="The translation table (genetic code) to use when the input sequences are nucleotide sequences. ") optional.add_argument("-t", "--transitive", required=False, type=int, default=4, help="Size of the transitive closure used to build the graph. 
This indicates the number of " "non related genes allowed in-between two related genes. Increasing it will improve " @@ -298,6 +320,8 @@ def parser_context(parser: argparse.ArgumentParser): optional.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus") optional.add_argument("--tmpdir", required=False, type=str, default=Path(tempfile.gettempdir()), help="directory for storing temporary files") + optional.add_argument("--keep_tmp", required=False, default=False, action="store_true", + help="Keeping temporary files (useful for debugging).") if __name__ == '__main__': From 5abe86958bfde2fd7cbd3507f765f4ea2145eb52 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 5 Sep 2023 17:36:40 +0200 Subject: [PATCH 090/173] add some checks on gene sequences status --- ppanggolin/align/alignOnPang.py | 65 ++++++++++++++++++---------- ppanggolin/formats/writeSequences.py | 7 ++- ppanggolin/projection/projection.py | 13 +++++- 3 files changed, 58 insertions(+), 27 deletions(-) diff --git a/ppanggolin/align/alignOnPang.py b/ppanggolin/align/alignOnPang.py index 0487b64b..b60ca633 100644 --- a/ppanggolin/align/alignOnPang.py +++ b/ppanggolin/align/alignOnPang.py @@ -19,6 +19,8 @@ from ppanggolin.region import Spot from ppanggolin.figures.draw_spot import draw_selected_spots, subgraph from ppanggolin.formats.readBinaries import get_gene_sequences_from_file +from ppanggolin.formats.writeSequences import write_gene_sequences_from_annotations + def create_mmseqs_db(seq_file: Path, tmpdir: Path, basename="sequences") -> Path: """ @@ -119,7 +121,7 @@ def align_seq_to_pang(target_seq_file:Path , query_seq_file: Path, output: Path, return Path(outfile.name) -def associate_input_seq_to_gene_family_from_aln_all(aln_res: Path, outdir:Path, pangenome: Pangenome) -> Tuple[Dict[str, GeneFamily], str]: +def map_input_gene_to_family_all_aln(aln_res: Path, outdir:Path, pangenome: Pangenome) -> Tuple[Dict[str, GeneFamily], Path]: """ Read alignment result 
to link input sequences to pangenome gene family. Alignment have been made against all genes of the pangenome. @@ -132,10 +134,12 @@ def associate_input_seq_to_gene_family_from_aln_all(aln_res: Path, outdir:Path, """ seq2pang = {} - result_file = outdir / f"alignment_input_seqs_to_all_pangenome_genes.tsv" # write the actual result file - logging.getLogger(f'Get write alignment file in {result_file}') + aln_file_clean = outdir / f"alignment_input_seqs_to_all_pangenome_genes.tsv" # write the actual result file + input_seq_to_gene_family = outdir / f"input_seqs_to_gene_family.tsv" + logging.getLogger().debug(f'Writing alignment file in {aln_file_clean}') + logging.getLogger().debug(f'Writing Gene family id to input seq id file in {input_seq_to_gene_family}') - with open(aln_res, "r") as alnFile, open(result_file, "w") as outfile : + with open(aln_res, "r") as alnFile, open(aln_file_clean, "w") as aln_outfl, open(input_seq_to_gene_family, "w") as outgene2fam: for line in alnFile: line_splitted = line.split() @@ -144,15 +148,17 @@ def associate_input_seq_to_gene_family_from_aln_all(aln_res: Path, outdir:Path, input_seq_id, gene_id = line_splitted[0:2] - outfile.write("\t".join(line_splitted) + "\n") + aln_outfl.write("\t".join(line_splitted) + "\n") if seq2pang.get(input_seq_id) is None: # if no results were found yet - seq2pang[input_seq_id] = pangenome.get_gene(gene_id).family # then the best hit is the first one we see. + family = pangenome.get_gene(gene_id).family + seq2pang[input_seq_id] = family # then the best hit is the first one we see. 
+ outgene2fam.write(f"{input_seq_id}\t{family.name}\n") - return seq2pang, outfile + return seq2pang, aln_file_clean -def associate_input_seq_to_gene_family_from_aln_rep(aln_res: Path, outdir:Path, pangenome: Pangenome) -> Tuple[Dict[str, GeneFamily], str]: +def map_input_gene_to_family_rep_aln(aln_res: Path, outdir:Path, pangenome: Pangenome) -> Tuple[Dict[str, GeneFamily], str]: """ Read alignment result to link input sequences to pangenome gene family. Alignment have been made against representative sequence of gene families of the pangenome. @@ -164,24 +170,29 @@ def associate_input_seq_to_gene_family_from_aln_rep(aln_res: Path, outdir:Path, :return: Dictionnary with sequence link to pangenome gene families and actual path to the cleaned alignment file """ seq2pang = {} - result_file = outdir / f"alignment_input_seqs_to_pangenome_gene_families.tsv" # write the actual result file - logging.getLogger().debug(f'Write alignment file in {result_file}') + aln_file_clean = outdir / f"alignment_input_seqs_to_pangenome_gene_families.tsv" # write the actual result file + + input_seq_to_gene_family = outdir / f"input_seqs_to_gene_family.tsv" + logging.getLogger().debug(f'Writing alignment file in {aln_file_clean}') + logging.getLogger().debug(f'Writing Gene family id to input seq id file in {input_seq_to_gene_family}') - with open(aln_res, "r") as alnFile, open(result_file, "w") as outfile : + with open(aln_res, "r") as alnFile, open(aln_file_clean, "w") as aln_outfl, open(input_seq_to_gene_family, "w") as outgene2fam: for line in alnFile: line_splitted = line.split() line_splitted[1] = line_splitted[1].replace("ppanggolin_", "") # remove the 'ppanggolin_' bit of the id line_splitted[0] = line_splitted[0].replace("ppanggolin_", "") - outfile.write("\t".join(line_splitted) + "\n") + aln_outfl.write("\t".join(line_splitted) + "\n") input_seq_id, gene_family_id = line_splitted[0:2] if seq2pang.get(input_seq_id) is None: # if no results were found yet - seq2pang[input_seq_id] 
= pangenome.get_gene_family(gene_family_id) # then the best hit is the first one we see. + family = pangenome.get_gene_family(gene_family_id) # then the best hit is the first one we see. + seq2pang[input_seq_id] = family + outgene2fam.write(f"{input_seq_id}\t{family.name}\n") - return seq2pang, outfile + return seq2pang, aln_file_clean @@ -233,10 +244,16 @@ def write_all_gene_sequences(pangenome: Pangenome, file_obj: IO, add: str = "", :param file_obj: Temporary file where sequences will be written :param add: Add prefix to sequence name """ - gene_ids_to_write = {gene.ID for fam in pangenome.gene_families for gene in fam.genes} - # TODO Check that the sequence are in file or loaded and launch appropriate fct accordingly - get_gene_sequences_from_file(pangenome.file, file_obj, gene_ids_to_write, add=add, - disable_bar=disable_bar) + genes_to_write = (gene for fam in pangenome.gene_families for gene in fam.genes) + + if pangenome.status["geneSequences"] == "inFile": + get_gene_sequences_from_file(pangenome.file, file_obj, {gene.ID for gene in genes_to_write}, + disable_bar=disable_bar) + elif pangenome.status["geneSequences"] in ["Computed", "Loaded"]: + write_gene_sequences_from_annotations(genes_to_write, file_obj, disable_bar=disable_bar) + else: + # this should never happen if the pangenome has been properly checked before launching this function. 
+ raise Exception("The pangenome does not include gene sequences") def project_and_write_partition(seqid_to_gene_family: Dict[str, GeneFamily], seq_set: Set[str], output: Path) -> Path: """ @@ -382,7 +399,7 @@ def get_seq_info(seq_to_pang: dict, pangenome: Pangenome, output: Path, draw_rel def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_file: Path, output: Path, tmpdir: Path, cpu: int = 1, no_defrag: bool = False, identity: float = 0.8, - coverage: float = 0.8, is_nucleotide: bool = False, translation_table: int = 11) -> Tuple[set, str, dict]: + coverage: float = 0.8, translation_table: int = 11) -> Tuple[set, str, dict]: """ Assign gene families from a pangenome to input sequences. @@ -419,14 +436,14 @@ def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_file: Path, is_query_nt=is_nucleotide, is_target_nt=False, translation_table=translation_table) - seq2pang, align_file = associate_input_seq_to_gene_family_from_aln_rep(align_file, output, pangenome) + seq2pang, align_file = map_input_gene_to_family_rep_aln(align_file, output, pangenome) return seqids_set, align_file, seq2pang def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_file: Path, output: Path, tmpdir: Path, cpu: int = 1, no_defrag: bool = False, identity: float = 0.8, coverage: float = 0.8, - is_nucleotide: bool = False, translation_table: int = 11,) -> Tuple[set, str, dict]: + translation_table: int = 11,) -> Tuple[set, str, dict]: """ Assign gene families from a pangenome to input sequences. 
@@ -462,7 +479,7 @@ def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_file: Path, is_query_nt=is_nucleotide, is_target_nt=True, translation_table=translation_table ) - seq2pang, align_file = associate_input_seq_to_gene_family_from_aln_all(align_file, output, pangenome) + seq2pang, align_file = map_input_gene_to_family_all_aln(align_file, output, pangenome) return seq_set, align_file, seq2pang @@ -520,7 +537,7 @@ def align(pangenome: Pangenome, sequence_file: Path, output: Path, identity: flo seq_set, align_file, seq2pang = get_input_seq_to_family_with_all(pangenome=pangenome, sequence_file=sequence_file, output=output, tmpdir=new_tmpdir, cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, - is_nucleotide=True, translation_table=translation_table) + translation_table=translation_table) if getinfo or draw_related: # TODO Add getinfo to function and remove if get_seq_info(seq2pang, pangenome, output, draw_related, disable_bar=disable_bar) @@ -528,7 +545,7 @@ def align(pangenome: Pangenome, sequence_file: Path, output: Path, identity: flo part_proj = project_and_write_partition(seq2pang, seq_set, output) # write the partition assignation only logging.getLogger().info(f"sequences partition projection : '{part_proj}'") logging.getLogger().info(f"{len(seq2pang)} sequences over {len(seq_set)} have at least one hit in the pangenome.") - logging.getLogger().info(f"Blast-tab file of the alignment : '{align_file.name}'") + logging.getLogger().info(f"Blast-tab file of the alignment : '{align_file}'") def launch(args: argparse.Namespace): diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py index 4df464a8..eb801c93 100644 --- a/ppanggolin/formats/writeSequences.py +++ b/ppanggolin/formats/writeSequences.py @@ -6,7 +6,7 @@ import logging import re from pathlib import Path -from typing import TextIO, Dict, Set +from typing import TextIO, Dict, Set, Iterable # installed libraries from tqdm import tqdm @@ -14,15 
+14,18 @@ # local libraries from ppanggolin.pangenome import Pangenome from ppanggolin.geneFamily import GeneFamily +from ppanggolin.genome import Gene from ppanggolin.utils import write_compressed_or_not, mk_outdir, read_compressed_or_not, restricted_float, detect_filetype from ppanggolin.formats.readBinaries import check_pangenome_info, get_gene_sequences_from_file + module_regex = re.compile(r'^module_[0-9]+') poss_values = ['all', 'persistent', 'shell', 'cloud', 'rgp', 'softcore', 'core', module_regex] poss_values_log = f"Possible values are {', '.join(poss_values[:-1])}, module_X with X being a module id." -def write_gene_sequences_from_annotations(genes_to_write: Pangenome, file_obj: TextIO, add: str = '', + +def write_gene_sequences_from_annotations(genes_to_write: Iterable[Gene], file_obj: TextIO, add: str = '', disable_bar: bool = False): """ Writes the CDS sequences to a File object, diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index 87fab1d7..aa45dffa 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -87,11 +87,22 @@ def launch(args: argparse.Namespace): "Projection of modules into the provided genome will not be performed.") project_modules = False + + if pangenome.status["geneSequences"] not in ["Loaded", "Computed", "inFile"] and not args.fast: + raise Exception("The provided pangenome has no gene sequences. " + "Projection is still possible with the --fast option to use representative " + "sequences rather than all genes to annotate input genes.") + + if pangenome.status["geneFamilySequences"] not in ["Loaded", "Computed", "inFile"]: + raise Exception("The provided pangenome has no gene families sequences. 
" + "This is not possible to annotate an input organism to this pangenome.") + check_pangenome_info(pangenome, need_annotations=True, need_families=True, disable_bar=args.disable_prog_bar, - need_rgp=predict_rgp, need_modules=project_modules, + need_rgp=predict_rgp, need_modules=project_modules, need_gene_sequences=False, need_spots=project_spots) + logging.getLogger().info('Retrieving parameters from the provided pangenome file.') pangenome_params = argparse.Namespace( **{step: argparse.Namespace(**k_v) for step, k_v in pangenome.parameters.items()}) From 06b85fc108e107fd9cae91fdbd1dc15f3cd8f075 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 5 Sep 2023 17:46:28 +0200 Subject: [PATCH 091/173] add potential mmseqs param to try out go faster in all genes startegy --- ppanggolin/align/alignOnPang.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ppanggolin/align/alignOnPang.py b/ppanggolin/align/alignOnPang.py index b60ca633..3c7f3b68 100644 --- a/ppanggolin/align/alignOnPang.py +++ b/ppanggolin/align/alignOnPang.py @@ -105,7 +105,7 @@ def align_seq_to_pang(target_seq_file:Path , query_seq_file: Path, output: Path, with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir.as_posix(), prefix="aln_result_db_file", suffix=".aln.DB", delete=False) as aln_db: cmd = ["mmseqs", "search", query_db.as_posix(), target_db.as_posix(), aln_db.name, tmpdir.as_posix(), "-a", "--min-seq-id", str(identity), - "-c", str(coverage), "--cov-mode", cov_mode, "--threads", str(cpu)] #, "--max-accept", str(1)] + "-c", str(coverage), "--cov-mode", cov_mode, "--threads", str(cpu)] #, "--max-accept", str(1), "--max-seqs", str(10)] logging.getLogger().info("Aligning sequences") logging.getLogger().debug(" ".join(cmd)) From 55cdaba9da1507fa1cdf4a343ecd6ae1e7532db0 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 5 Sep 2023 17:46:34 +0200 Subject: [PATCH 092/173] update github action --- .github/workflows/main.yml | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 
deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index a8652acd..73940b46 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -93,13 +93,14 @@ jobs: shell: bash -l {0} run: | cd testingDataset - ppanggolin align --pangenome mybasicpangenome/pangenome.h5 --sequences some_chlam_proteins.fasta --output test_align --draw_related --getinfo + ppanggolin align --pangenome mybasicpangenome/pangenome.h5 --sequences some_chlam_proteins.fasta \ + --output test_align --draw_related --getinfo --fast cd - - name: testing context command shell: bash -l {0} run: | cd testingDataset - ppanggolin context --pangenome myannopang/pangenome.h5 --sequences some_chlam_proteins.fasta --output test_context + ppanggolin context --pangenome myannopang/pangenome.h5 --sequences some_chlam_proteins.fasta --output test_context --fast ppanggolin context --pangenome readclusterpang/pangenome.h5 --family some_chlam_families.txt --output test_context -f cd - - name: testing metadata command @@ -122,6 +123,10 @@ jobs: shell: bash -l {0} run: | cd testingDataset - ppanggolin projection --pangenome stepbystep/pangenome.h5 -o projection_from_gbff --organism_name trotro --annot GBFF/GCF_000026905.1_ASM2690v1_genomic.gbff.gz --spot_graph - ppanggolin projection --pangenome mybasicpangenome/pangenome.h5 -o projection_from_fasta --organism_name trotro --fasta_file FASTA/GCF_002776845.1_ASM277684v1_genomic.fna.gz --spot_graph --graph_formats graphml + ppanggolin projection --pangenome stepbystep/pangenome.h5 -o projection_from_gbff \ + --organism_name trotro --annot GBFF/GCF_000026905.1_ASM2690v1_genomic.gbff.gz --spot_graph + ppanggolin projection --pangenome mybasicpangenome/pangenome.h5 -o projection_from_fasta \ + --organism_name trotro --fasta_file FASTA/GCF_002776845.1_ASM277684v1_genomic.fna.gz \ + --spot_graph --graph_formats graphml --fast --keep_tmp + \ No newline at end of file From ca7cf854a0c7799b50ed1f38247435ce88b761a6 Mon Sep 17 00:00:00 
2001 From: JeanMainguy Date: Thu, 7 Sep 2023 11:41:01 +0200 Subject: [PATCH 093/173] write only non redundant gene sequences in default mode --- ppanggolin/align/alignOnPang.py | 11 +++----- ppanggolin/formats/readBinaries.py | 41 +++++++++++++++++++++++++++--- 2 files changed, 41 insertions(+), 11 deletions(-) diff --git a/ppanggolin/align/alignOnPang.py b/ppanggolin/align/alignOnPang.py index 3c7f3b68..e11ea330 100644 --- a/ppanggolin/align/alignOnPang.py +++ b/ppanggolin/align/alignOnPang.py @@ -18,8 +18,7 @@ from ppanggolin.pangenome import Pangenome from ppanggolin.region import Spot from ppanggolin.figures.draw_spot import draw_selected_spots, subgraph -from ppanggolin.formats.readBinaries import get_gene_sequences_from_file -from ppanggolin.formats.writeSequences import write_gene_sequences_from_annotations +from ppanggolin.formats.readBinaries import get_non_redundant_gene_sequences_from_file def create_mmseqs_db(seq_file: Path, tmpdir: Path, basename="sequences") -> Path: @@ -105,7 +104,7 @@ def align_seq_to_pang(target_seq_file:Path , query_seq_file: Path, output: Path, with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir.as_posix(), prefix="aln_result_db_file", suffix=".aln.DB", delete=False) as aln_db: cmd = ["mmseqs", "search", query_db.as_posix(), target_db.as_posix(), aln_db.name, tmpdir.as_posix(), "-a", "--min-seq-id", str(identity), - "-c", str(coverage), "--cov-mode", cov_mode, "--threads", str(cpu)] #, "--max-accept", str(1), "--max-seqs", str(10)] + "-c", str(coverage), "--cov-mode", cov_mode, "--threads", str(cpu), "--max-accept", str(1)] logging.getLogger().info("Aligning sequences") logging.getLogger().debug(" ".join(cmd)) @@ -244,13 +243,9 @@ def write_all_gene_sequences(pangenome: Pangenome, file_obj: IO, add: str = "", :param file_obj: Temporary file where sequences will be written :param add: Add prefix to sequence name """ - genes_to_write = (gene for fam in pangenome.gene_families for gene in fam.genes) if 
pangenome.status["geneSequences"] == "inFile": - get_gene_sequences_from_file(pangenome.file, file_obj, {gene.ID for gene in genes_to_write}, - disable_bar=disable_bar) - elif pangenome.status["geneSequences"] in ["Computed", "Loaded"]: - write_gene_sequences_from_annotations(genes_to_write, file_obj, disable_bar=disable_bar) + get_non_redundant_gene_sequences_from_file(pangenome.file, file_obj, disable_bar=disable_bar) else: # this should never happen if the pangenome has been properly checked before launching this function. raise Exception("The pangenome does not include gene sequences") diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py index 7f5b4c84..e7f831f0 100644 --- a/ppanggolin/formats/readBinaries.py +++ b/ppanggolin/formats/readBinaries.py @@ -196,20 +196,55 @@ def read_sequences(h5f: tables.File) -> dict: seqid2seq[row["seqid"]] = row['dna'].decode() return seqid2seq +def get_non_redundant_gene_sequences_from_file(pangenome_filename: str, file_obj: TextIO, add: str = '', + disable_bar: bool = False): + """ + Writes the non redundant CDS sequences of the Pangenome object to a File object that can be filtered or not by a list of CDS, + and adds the eventual str 'add' in front of the identifiers. Loads the sequences from a .h5 pangenome file. + + :param pangenome_filename: Name of the pangenome file + :param file_obj: Name of the output file + :param add: Add a prefix to sequence header + :param disable_bar: disable progress bar + + """ -def get_gene_sequences_from_file(filename: str, file_obj: TextIO, list_cds: iter = None, add: str = '', + logging.getLogger("PPanGGOLiN").info(f"Extracting and writing non redundant CDS sequences from {pangenome_filename} to {file_obj.name}") + + with tables.open_file(pangenome_filename, "r", driver_core_backing_store=0) as h5f: + + # get a dictionarry mapping seqid to cds_name + # seqid are uniq and can have multiple cds name. 
+ # We just want one of the cds name to have non redundant fasta sequences + seqid2cds_name = {} + for row in read_chunks(h5f.root.geneSequences, chunk=20000): + # Read the table chunk per chunk otherwise RAM dies on big pangenomes + seqid2cds_name[row["seqid"]] = row["gene"].decode() + + table = h5f.root.sequences + for row in tqdm(read_chunks(table, chunk=20000), total=table.nrows, unit="gene", disable=disable_bar): + + cds_name = seqid2cds_name[row["seqid"]] + file_obj.write(f'>{add}{cds_name}\n') + file_obj.write(f'{row["dna"].decode()}\n') + + file_obj.flush() + + +def get_gene_sequences_from_file(pangenome_filename: str, file_obj: TextIO, list_cds: iter = None, add: str = '', disable_bar: bool = False): """ Writes the CDS sequences of the Pangenome object to a File object that can be filtered or not by a list of CDS, and adds the eventual str 'add' in front of the identifiers. Loads the sequences from a .h5 pangenome file. - :param filename: Name of the pangenome file + + :param pangenome_filename: Name of the pangenome file :param file_obj: Name of the output file :param list_cds: An iterable object of CDS :param add: Add a prefix to sequence header :param disable_bar: Prevent to print disable progress bar """ logging.getLogger("PPanGGOLiN").info(f"Extracting and writing CDS sequences from a {filename} file to a fasta file...") - h5f = tables.open_file(filename, "r", driver_core_backing_store=0) + h5f = tables.open_file(pangenome_filename, "r", driver_core_backing_store=0) table = h5f.root.geneSequences list_cds = set(list_cds) if list_cds is not None else None seqid2seq = read_sequences(h5f) From 4029115ab2d5a689c1f45bfefe3e5eedbf24b24d Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Thu, 7 Sep 2023 12:12:01 +0200 Subject: [PATCH 094/173] Merge remote-tracking branch 'origin/dev' into projection --- .github/workflows/check_recipes.yml | 2 +- .github/workflows/main.yml | 10 +- VERSION | 2 +- docs/user/Regions-of-Genome-Plasticity.md | 31 + 
ppanggolin/RGP/__init__.py | 1 + ppanggolin/RGP/genomicIsland.py | 24 +- ppanggolin/RGP/rgp_cluster.py | 625 +++++++++++++++ ppanggolin/RGP/spot.py | 12 +- ppanggolin/__init__.py | 3 +- ppanggolin/align/alignOnPang.py | 2 +- ppanggolin/annotate/annotate.py | 101 ++- ppanggolin/annotate/synta.py | 39 +- ppanggolin/cluster/cluster.py | 30 +- ppanggolin/context/searchGeneContext.py | 12 +- ppanggolin/edge.py | 94 ++- ppanggolin/figures/draw_spot.py | 14 +- ppanggolin/figures/tile_plot.py | 23 +- ppanggolin/figures/ucurve.py | 18 +- ppanggolin/formats/readBinaries.py | 71 +- ppanggolin/formats/writeBinaries.py | 73 +- ppanggolin/formats/writeFlat.py | 145 ++-- ppanggolin/formats/writeMSA.py | 12 +- ppanggolin/formats/writeMetadata.py | 4 +- ppanggolin/formats/writeSequences.py | 8 +- ppanggolin/geneFamily.py | 364 +++++++-- ppanggolin/genome.py | 625 ++++++++++++--- ppanggolin/graph/makeGraph.py | 6 +- ppanggolin/main.py | 8 +- ppanggolin/meta/meta.py | 16 +- ppanggolin/metadata.py | 180 +++-- ppanggolin/metrics/fluidity.py | 6 +- ppanggolin/mod/module.py | 21 +- ppanggolin/nem/partition.py | 7 +- ppanggolin/nem/rarefaction.py | 8 +- ppanggolin/pangenome.py | 526 +++++++------ ppanggolin/projection/projection.py | 30 +- ppanggolin/region.py | 755 +++++++++++++----- ppanggolin/utility/utils.py | 2 +- ppanggolin/workflow/all.py | 9 +- requirements.txt | 2 +- tests/genome/test_Contig.py | 79 -- tests/genome/test_Feature.py | 70 -- tests/genome/test_Gene.py | 57 -- tests/genome/test_Organism.py | 88 --- tests/region/test_Region.py | 203 ----- tests/region/test_rgp_cluster.py | 225 ++++++ tests/test_Edge.py | 134 ---- tests/test_GeneFamily.py | 266 ------- tests/test_Pangenome.py | 360 --------- tests/test_edge.py | 131 ++++ tests/test_genefamily.py | 310 ++++++++ tests/test_genome.py | 580 ++++++++++++++ tests/test_metadata.py | 149 ++++ tests/test_pangenome.py | 891 ++++++++++++++++++++++ tests/test_region.py | 787 +++++++++++++++++++ 55 files changed, 5932 insertions(+), 
2319 deletions(-) create mode 100644 ppanggolin/RGP/rgp_cluster.py delete mode 100644 tests/genome/test_Contig.py delete mode 100644 tests/genome/test_Feature.py delete mode 100644 tests/genome/test_Gene.py delete mode 100644 tests/genome/test_Organism.py delete mode 100644 tests/region/test_Region.py create mode 100644 tests/region/test_rgp_cluster.py delete mode 100644 tests/test_Edge.py delete mode 100644 tests/test_GeneFamily.py delete mode 100644 tests/test_Pangenome.py create mode 100644 tests/test_edge.py create mode 100644 tests/test_genefamily.py create mode 100644 tests/test_genome.py create mode 100644 tests/test_metadata.py create mode 100644 tests/test_pangenome.py create mode 100644 tests/test_region.py diff --git a/.github/workflows/check_recipes.yml b/.github/workflows/check_recipes.yml index 286fb866..8fb0bf7f 100644 --- a/.github/workflows/check_recipes.yml +++ b/.github/workflows/check_recipes.yml @@ -18,7 +18,7 @@ jobs: strategy: matrix: os: ['ubuntu-latest','macos-latest'] - python-version: ['3.7','3.8','3.9','3.10'] + python-version: ['3.8','3.9','3.10'] # Steps represent a sequence of tasks that will be executed as part of the job steps: # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 73940b46..2b821884 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -17,7 +17,7 @@ jobs: strategy: matrix: os: ['ubuntu-latest', 'macos-latest'] - python-version: ['3.7', '3.8', '3.9', '3.10'] + python-version: ['3.8', '3.9', '3.10'] steps: # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it - uses: actions/checkout@v2 @@ -89,6 +89,14 @@ jobs: ppanggolin cluster --clusters clusters.tsv -p readclusters/pangenome.h5 ppanggolin msa --pangenome readclusterpang/pangenome.h5 --partition persistent --phylo -o readclusterpang/msa/ -f cd - + - name: testing rgp_cluster command + shell: bash -l {0} + run: | + cd 
testingDataset + ppanggolin rgp_cluster --pangenome mybasicpangenome/pangenome.h5 + ppanggolin rgp_cluster --pangenome mybasicpangenome/pangenome.h5 --ignore_incomplete_rgp --grr_metric max_grr -f --graph_formats graphml gexf + ppanggolin rgp_cluster --pangenome mybasicpangenome/pangenome.h5 --no_identical_rgp_merging -o rgp_clustering_no_identical_rgp_merging --graph_formats graphml + cd - - name: testing align command shell: bash -l {0} run: | diff --git a/VERSION b/VERSION index e829013f..52cd5461 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.132 +1.2.173 diff --git a/docs/user/Regions-of-Genome-Plasticity.md b/docs/user/Regions-of-Genome-Plasticity.md index f14b11d1..b7bfa771 100644 --- a/docs/user/Regions-of-Genome-Plasticity.md +++ b/docs/user/Regions-of-Genome-Plasticity.md @@ -27,3 +27,34 @@ Spots can be computed once RGPs have been predicted. You can do that using: For versions between 1.1.0 and 1.2.12, you can use additional option '--draw_hotspots' which uses [genoplotR](http://genoplotr.r-forge.r-project.org/) to draw those spots in png figures. For versions above 1.2.12, you can use the dedicated subcommand [draw](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#draw), which uses the python library [bokeh](http://docs.bokeh.org/en/latest/) to draw interactive figures which can be visualized and modified directly in the browser. Information about spots can then be written using `ppanggolin write -p pangenome --spots` which will provide a [file linking RGPs with their spots](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#spots) and a [file showing multiple metrics for each spot](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#summarize-spots) + + + +# RGP cluster based on their gene families + +To cluster RGPs (Regions of Genome Plasticity) based on their gene families, you can use the command `ppanggolin rgp_cluster`.
+The `ppanggolin rgp_cluster` command performs the following steps to cluster RGPs (Regions of Genome Plasticity) based on their gene families: + +1. Calculation of GRR (Gene Repertoire Relatedness): The command calculates the GRR values for all pairs of RGPs. The GRR metric evaluates the similarity between two RGPs by assessing their shared gene families. +2. Graph Construction: The command constructs a graph representation of the RGPs, where each RGP is represented as a node in the graph. The edges between the nodes are weighted using the GRR values, indicating the strength of the relationship between the RGPs. +3. Filtering GRR Values: GRR values below the `--grr_cutoff` threshold (default 0.8) are filtered out to remove noise from the analysis. +4. Louvain Communities Clustering: The Louvain communities clustering algorithm is then applied to the graph. This algorithm identifies clusters of RGPs with similar gene family relationships. + +There are three modes available for calculating the GRR value: `min_grr`, `max_grr`, or `incomplete_aware_grr`. +- `min_grr` mode: This mode computes the number of gene families shared between two RGPs and divides it by the smaller number of gene families among the two RGPs. +- `max_grr` mode: In this mode, the number of gene families shared between two RGPs is calculated and divided by the larger number of gene families among the two RGPs. +- `incomplete_aware_grr` (default) mode: If at least one RGP is considered incomplete, which typically happens when it is located at the border of a contig, the `min_grr` mode is used. Otherwise, the `max_grr` mode is applied. This mode is useful to correctly cluster incomplete RGPs.
+ + +The resulting RGP clusters are stored in a tsv file with the following columns: + +| column | description | +|---------|------------------------------| +| RGP | The unique region identifier | +| cluster | The cluster id of the RGP | +| spot_id | The spot ID of the RGP | + + + +The command also generates an RGP graph in the gexf format, which can be utilized to explore the RGP clusters along with their spots of insertion. In this graph identical RGPs with the same family content and with the same spot are merged into a single node to simplify the graph representation. This feature can be disabled with the parameter `--no_identical_rgp_merging`. + diff --git a/ppanggolin/RGP/__init__.py b/ppanggolin/RGP/__init__.py index cd925093..61883771 100644 --- a/ppanggolin/RGP/__init__.py +++ b/ppanggolin/RGP/__init__.py @@ -1,2 +1,3 @@ from .genomicIsland import subparser, launch from .spot import * +from . import rgp_cluster \ No newline at end of file diff --git a/ppanggolin/RGP/genomicIsland.py b/ppanggolin/RGP/genomicIsland.py index 2bba4591..4f5cfc0e 100644 --- a/ppanggolin/RGP/genomicIsland.py +++ b/ppanggolin/RGP/genomicIsland.py @@ -5,6 +5,7 @@ import logging import argparse from pathlib import Path +from typing import Set # installed libraries from tqdm import tqdm @@ -30,7 +31,7 @@ def changes(self, score): self.score = score if score >= 0 else 0 -def extract_rgp(contig, node, rgp_id, naming): +def extract_rgp(contig, node, rgp_id, naming) -> Region: """ Extract the region from the given starting node """ @@ -40,7 +41,7 @@ def extract_rgp(contig, node, rgp_id, naming): elif naming == "organism": new_region = Region(node.gene.organism.name + "_" + contig.name + "_RGP_" + str(rgp_id)) while node.state: - new_region.append(node.gene) + new_region.add(node.gene) node.state = 0 node.score = 0 node = node.prev @@ -148,7 +149,7 @@ def init_matrices(contig: Contig, multi: set, persistent_penalty: int = 3, varia def mk_regions(contig: Contig, matrix: list, multi: set,
min_length: int = 3000, min_score: int = 4, - persistent: int = 3, continuity: int = 1, naming: str = "contig") -> set: + persistent: int = 3, continuity: int = 1, naming: str = "contig") -> Set[Region]: """ Processing matrix and 'emptying' it to get the regions. @@ -183,7 +184,7 @@ def max_index_node(lst): while val >= min_score: new_region = extract_rgp(contig, matrix[index], len(contig_regions), naming) new_region.score = val - if (new_region[0].stop - new_region[-1].start) > min_length: + if new_region.length > min_length: contig_regions.add(new_region) rewrite_matrix(contig, matrix, index, persistent, continuity, multi) val, index = max_index_node(matrix) @@ -208,8 +209,8 @@ def compute_org_rgp( organism: Organism, multigenics: set, :return: A set of RGPs of the provided organism. """ org_regions = set() - for contig in tqdm(organism.contigs, total=len(organism.contigs), unit="contig", disable=disable_bar): - if len(contig.genes) != 0: # some contigs have no coding genes... + for contig in tqdm(organism.contigs, total=organism.number_of_contigs, unit="contig", disable=disable_bar): + if contig.number_of_genes != 0: # some contigs have no coding genes... # can definitely multiprocess this part, as not THAT much information is needed... 
matrix = init_matrices(contig, multigenics, persistent_penalty, variable_gain) org_regions |= mk_regions( @@ -276,16 +277,17 @@ def predict_rgp(pangenome: Pangenome, persistent_penalty: int = 3, variable_gain # check statuses and load info check_pangenome_former_rgp(pangenome, force) check_pangenome_info(pangenome, need_annotations=True, need_families=True, need_graph=False, need_partitions=True, - disable_bar=disable_bar) + disable_bar=disable_bar) logging.getLogger("PPanGGOLiN").info("Detecting multigenic families...") multigenics = pangenome.get_multigenics(dup_margin) logging.getLogger("PPanGGOLiN").info("Compute Regions of Genomic Plasticity ...") name_scheme = naming_scheme(pangenome) - for org in tqdm(pangenome.organisms, total=pangenome.number_of_organisms(), unit="genomes", disable=disable_bar): - pangenome.add_regions(compute_org_rgp(org, multigenics, persistent_penalty, variable_gain, min_length, - min_score, naming=name_scheme)) - logging.getLogger("PPanGGOLiN").info(f"Predicted {len(pangenome.regions)} RGP") + for org in tqdm(pangenome.organisms, total=pangenome.number_of_organisms, unit="genomes", disable=disable_bar): + for region in compute_org_rgp(org, multigenics, persistent_penalty, variable_gain, min_length, + min_score, naming=name_scheme): + pangenome.add_region(region) + logging.getLogger("PPanGGOLiN").info(f"Predicted {pangenome.number_of_rgp} RGP") # save parameters and save status pangenome.parameters["rgp"] = {} diff --git a/ppanggolin/RGP/rgp_cluster.py b/ppanggolin/RGP/rgp_cluster.py new file mode 100644 index 00000000..63b9f2a8 --- /dev/null +++ b/ppanggolin/RGP/rgp_cluster.py @@ -0,0 +1,625 @@ +#!/usr/bin/env python3 +# coding:utf-8 + +# default libraries +import logging +import argparse +import os +from itertools import combinations +from collections.abc import Callable +from collections import defaultdict +from typing import Dict, List, Tuple, Set, Union, Any +from pathlib import Path + +# installed libraries +from tqdm import tqdm 
+import networkx as nx +import pandas as pd + +# local libraries +from ppanggolin.pangenome import Pangenome +from ppanggolin.region import Region +from ppanggolin.formats import check_pangenome_info +from ppanggolin.utils import restricted_float, mk_outdir +from ppanggolin.geneFamily import GeneFamily + + +class IdenticalRegions: + """ + Represents a group of Identical Regions within a pangenome. + + :param name: The name of the identical region group. + :param identical_rgps: A set of Region objects representing the identical regions. + :param families: A set of GeneFamily objects associated with the identical regions. + :param is_contig_border: A boolean indicating if the identical regions span across contig borders. + """ + + def __init__(self, name: str, identical_rgps: Set[Region], families: Set[GeneFamily], is_contig_border: bool): + if not isinstance(identical_rgps, set): + raise TypeError("Expected 'identical_rgps' to be a set") + else: + if len(identical_rgps) == 0: + raise ValueError("Set of identical_rgps must not be empty") + if not all(isinstance(region, Region) for region in identical_rgps): + raise TypeError("All element in identical_rgps must be `Region`") + if not isinstance(families, set): + raise TypeError("Expected 'families' to be a set") + else: + if len(families) == 0: + raise ValueError("Set of families must not be empty") + if not all(isinstance(family, GeneFamily) for family in families): + raise TypeError("All element in families must be `GeneFamilies`") + self.name = name + self.families = families + self.rgps = identical_rgps + self.is_contig_border = is_contig_border + self.ID = Region.id_counter + Region.id_counter += 1 + + def __eq__(self, other: 'IdenticalRegions') -> bool: + """ + Check if two IdenticalRegions objects are equal based on their families, identical regions, and contig border status. + + :param other: The IdenticalRegions object to compare. + :return: True if the objects are equal, False otherwise. 
+ """ + if not isinstance(other, IdenticalRegions): + # don't attempt to compare against unrelated types + raise TypeError("'IdenticalRegions' type object was expected, " + f"but '{type(other)}' type object was provided.") + + return self.families == other.families and self.rgps == other.rgps and self.is_contig_border == other.is_contig_border + + def __repr__(self): + return f"IdenticalRegions(name='{self.name}', num_rgps={len(self.rgps)}, num_families={len(self.families)}, is_contig_border={self.is_contig_border})" + + def __str__(self): + return self.name + + def __hash__(self): + return id(self) + + def __lt__(self, obj): + return self.ID < obj.ID + + def __gt__(self, obj): + return self.ID > obj.ID + + def __le__(self, obj): + return self.ID <= obj.ID + + def __ge__(self, obj): + return self.ID >= obj.ID + + +def compute_grr(rgp_a_families: Set[GeneFamily], rgp_b_families: Set[GeneFamily], mode: Callable) -> float: + """ + Compute gene repertoire relatedness (GRR) between two rgp. + Mode can be the function min to compute min GRR or max to compute max_grr + + :param rgp_a_families: Rgp A + :param rgp_b_families: rgp B + :param mode: min or max function + + :return: GRR value between 0 and 1 + """ + + grr = len((rgp_a_families & rgp_b_families)) / mode(len(rgp_a_families), len(rgp_b_families)) + + return grr + + +def compute_jaccard_index(rgp_a_families: set, rgp_b_families: set) -> float: + """ + Compute jaccard index between two rgp based on their famillies. + + :param rgp_a_families: Rgp A + :param rgp_b_families: rgp B + + :return : Jaccard index + """ + + jaccard_index = len((rgp_a_families & rgp_b_families)) / len(rgp_a_families | rgp_b_families) + + return jaccard_index + + +def add_info_to_rgp_nodes(graph, regions: List[Region], region_to_spot: dict): + """ + Format RGP information into a dictionary for adding to the graph. 
+ + This function takes a list of RGPs and a dictionary mapping each RGP to its corresponding spot ID, + and formats the RGP information into a dictionary for further processing or addition to a graph. + + :param graph: RGPs graph + :param regions: A list of RGPs. + :param region_to_spot: A dictionary mapping each RGP to its corresponding spot ID. + :return: A dictionary with RGP id as the key and a dictionary containing information on the corresponding RGP as value. + """ + + region_attributes = {} + for region in regions: + region_info = {"contig": region.contig.name, + 'organism': region.organism.name, + "name": region.name, + "genes_count": len(region), + "is_contig_border": region.is_contig_border, + "is_whole_contig": region.is_whole_contig, + "spot_id": get_spot_id(region, region_to_spot), + 'families_count': region.number_of_families} + + region_attributes[region.ID] = region_info + + node_attributes = graph.nodes[region.ID] + node_attributes.update(region_info) + + return region_attributes + + +def join_dicts(dicts: List[Dict[str, Any]], delimiter: str = ';') -> Dict[str, Any]: + """ + Join dictionaries by concatenating the values with a custom delimiter for common keys. + + Given a list of dictionaries, this function creates a new dictionary where the values for common keys + are concatenated with the specified delimiter. + + :param dicts: A list of dictionaries to be joined. + :param delimiter: The delimiter to use for joining values. Default is ';'. + :return: A dictionary with joined values for common keys. + """ + final_dict = defaultdict(list) + for dict_obj in dicts: + for k, v in dict_obj.items(): + final_dict[k].append(str(v)) + return {k: delimiter.join(v) for k, v in final_dict.items()} + + +def format_rgp_metadata(rgp: Region) -> Dict[str, str]: + """ + Format RGP metadata by combining source and field values. 
+ + Given an RGP object with metadata, this function creates a new dictionary where the keys + are formatted as 'source_field' and the values are concatenated with '|' as the delimiter. + + :param rgp: The RGP object with metadata. + :return: A dictionary with formatted metadata. + """ + source_field_2_value = defaultdict(list) + for rgp_metadata in rgp.metadata: + source = rgp_metadata.source + for field in rgp_metadata.fields: + source_field_2_value[f"{source}_{field}"].append(str(rgp_metadata.get(field))) + + return {col_name: '|'.join(values) for col_name, values in source_field_2_value.items()} + + +def add_rgp_metadata_to_graph(graph: nx.Graph, rgps: Set[Union[Region, IdenticalRegions]]) -> None: + """ + Add metadata from Region or IdenticalRegions objects to the graph. + + :param graph: The graph to which the metadata will be added. + :param rgps: A set of Region or IdenticalRegions objects containing the metadata to be added. + + """ + for rgp in rgps: + if isinstance(rgp, Region): + rgp_metadata = format_rgp_metadata(rgp) + elif isinstance(rgp, IdenticalRegions): + rgp_metadata_dicts = [format_rgp_metadata(ident_rgp) for ident_rgp in rgp.rgps] + rgp_metadata = join_dicts(rgp_metadata_dicts) + else: + raise TypeError(f'Expect Region or IdenticalRegions object, not {type(rgp)}') + + for metadata_name, value in rgp_metadata.items(): + graph.nodes[rgp.ID][metadata_name] = value + + +def add_info_to_identical_rgps(rgp_graph: nx.Graph, identical_rgps_objects: List[IdenticalRegions], + rgp_to_spot: Dict[Region, int]): + """ + Add identical rgps info in the graph as node attributes. 
+ + :params rgp_graph: Graph with rgp id as node and grr value as edges + :params rgp_to_identical_rgps: dict with uniq RGP as the key and set of identical rgps as value + """ + + for identical_rgp_obj in identical_rgps_objects: + spots_of_identical_rgp_obj = {get_spot_id(i_rgp, rgp_to_spot) for i_rgp in identical_rgp_obj.rgps} + + rgp_graph.add_node(identical_rgp_obj.ID, + identical_rgp_group=True, + name=identical_rgp_obj.name, + families_count=len(identical_rgp_obj.families), + identical_rgp_count=len(identical_rgp_obj.rgps), + identical_rgp_names=';'.join([i_rgp.name for i_rgp in identical_rgp_obj.rgps]), + identical_rgp_organisms=';'.join({i_rgp.organism.name for i_rgp in identical_rgp_obj.rgps}), + identical_rgp_contig_border_count=len( + [True for i_rgp in identical_rgp_obj.rgps if i_rgp.is_contig_border]), + identical_rgp_whole_contig_count=len( + [True for i_rgp in identical_rgp_obj.rgps if i_rgp.is_whole_contig]), + identical_rgp_spots=";".join(spots_of_identical_rgp_obj), + spot_id=spots_of_identical_rgp_obj.pop() if len( + spots_of_identical_rgp_obj) == 1 else "Mulitple spots" + ) + + +def add_edges_to_identical_rgps(rgp_graph: nx.Graph, identical_rgps_objects: List[IdenticalRegions]): + """ + Replace identical rgp objects by all identical RGPs it contains. + + :param rgp_graph: The RGP graph to add edges to. + :param identical_rgps_objects: A dictionary mapping RGPs to sets of identical RGPs. 
+ """ + + identical_edge_data = {'grr': 1.0, 'max_grr': 1.0, + 'min_grr': 1.0, + "identical_famillies": True} + + added_identical_rgps = [] + + for identical_rgp_obj in identical_rgps_objects: + + rgp_graph.add_nodes_from([ident_rgp.ID for ident_rgp in identical_rgp_obj.rgps], + identical_rgp_group=identical_rgp_obj.name) + + # add edge between identical rgp with metrics at one (perfect score) + edges_to_add = [(rgp_a.ID, rgp_b.ID, identical_edge_data) + for rgp_a, rgp_b in combinations(identical_rgp_obj.rgps, 2)] + + # replicate all edges that connect identical rgp object to other rgps + for connected_rgp in rgp_graph.neighbors(identical_rgp_obj.ID): + edge_data = rgp_graph[identical_rgp_obj.ID][connected_rgp] + edges_to_add += [(identical_rgp.ID, connected_rgp, edge_data) + for identical_rgp in identical_rgp_obj.rgps] + + rgp_graph.add_edges_from(edges_to_add) + + # remove node of the identical rgp object + rgp_graph.remove_node(identical_rgp_obj.ID) + + added_identical_rgps += list(identical_rgp_obj.rgps) + + return added_identical_rgps + + +def dereplicate_rgp(rgps: Set[Union[Region, IdenticalRegions]], + disable_bar: bool = False) -> List[Union[Region, IdenticalRegions]]: + """ + Dereplicate RGPs that have the same families. + + Given a list of Region or IdenticalRegions objects representing RGPs, this function groups together + RGPs with the same families into IdenticalRegions objects and returns a list of dereplicated RGPs. + + :param rgps: A set of Region or IdenticalRegions objects representing the RGPs to be dereplicated. + :param disable_bar: If True, disable the progress bar. + + :return: A list of dereplicated RGPs (Region or IdenticalRegions objects). For RGPs with the same families, + they will be grouped together in IdenticalRegions objects. 
+ """ + logging.info(f'Dereplicating {len(rgps)} RGPs') + families_to_rgps = defaultdict(list) + + for rgp in tqdm(rgps, total=len(rgps), unit="RGP", disable=disable_bar): + families_to_rgps[tuple(sorted((f.ID for f in rgp.families)))].append(rgp) + + dereplicated_rgps = [] + identical_region_count = 0 + for rgps in families_to_rgps.values(): + if len(rgps) == 1: + dereplicated_rgps.append(rgps[0]) + else: + families = set(rgps[0].families) + + # identical regions object is considered on a contig border if all rgp are contig border + is_contig_border = all([rgp.is_contig_border for rgp in rgps]) + + # create a new object that will represent the identical rgps + identical_rgp = IdenticalRegions(name=f"identical_rgps_{identical_region_count}", + identical_rgps=set(rgps), + families=families, + is_contig_border=is_contig_border) + identical_region_count += 1 + dereplicated_rgps.append(identical_rgp) + + logging.info(f'{len(dereplicated_rgps)} unique RGPs') + return dereplicated_rgps + + +def compute_rgp_metric(rgp_a: Region, + rgp_b: Region, + grr_cutoff: float, + grr_metric: str) -> Union[Tuple[int, int, dict], None]: + """ + Compute GRR metric between two RGPs. 
+ + :param rgp_a: A rgp + :param rgp_b: another rgp + :param grr_cutoff: Cutoff filter + :param grr_metric: grr mode between min_grr, max_grr and incomplete_aware_grr + + :returns: Tuple containing the IDs of the two RGPs and the computed metrics as a dictionary + """ + + edge_metrics = {} + + # RGP at a contig border are seen as incomplete and min GRR is used instead of max GRR + if rgp_a.is_contig_border or rgp_b.is_contig_border: + edge_metrics["incomplete_aware_grr"] = compute_grr(set(rgp_a.families), set(rgp_b.families), min) + else: + edge_metrics["incomplete_aware_grr"] = compute_grr(set(rgp_a.families), set(rgp_b.families), max) + + # Compute max and min GRR metrics + edge_metrics['max_grr'] = compute_grr(set(rgp_a.families), set(rgp_b.families), max) + edge_metrics['min_grr'] = compute_grr(set(rgp_a.families), set(rgp_b.families), min) + + # The number of shared families can be useful when visualizing the graph + edge_metrics['shared_family'] = len(set(rgp_a.families).intersection(set(rgp_b.families))) + + # Only return the metrics if the GRR value is above the cutoff + if edge_metrics[grr_metric] >= grr_cutoff: + return rgp_a.ID, rgp_b.ID, edge_metrics + + +def cluster_rgp_on_grr(graph: nx.Graph, clustering_attribute: str = "grr"): + """ + Cluster rgp based on grr using louvain communities clustering. 
+ + :param graph: NetworkX graph object representing the RGPs and their relationship + :param clustering_attribute: Attribute of the graph to use for clustering (default is "grr") + """ + + partitions = nx.algorithms.community.louvain_communities( + graph, weight=clustering_attribute) + + # Add partition index in node attributes + for i, cluster_nodes in enumerate(partitions): + nx.set_node_attributes( + graph, {node: f"cluster_{i}" for node in cluster_nodes}, name=f"{clustering_attribute}_cluster") + + logging.info( + f"Graph has {len(partitions)} clusters using {clustering_attribute}") + + +def get_spot_id(rgp: Region, rgp_to_spot: Dict[Region, int]) -> str: + """ + Return Spot ID associated to an RGP. + It adds the prefix "spot_" to the spot ID. + When no spot is associated with the RGP, then the string "No spot" is return + + :params rgp: RGP id + :params rgp_to_spot: A dictionary mapping an RGP to its spot. + + :return: Spot ID of the given RGP with the prefix spot_ or "No spot". + """ + if rgp in rgp_to_spot: + return f"spot_{rgp_to_spot[rgp]}" + else: + return "No spot" + + +def write_rgp_cluster_table(outfile: str, grr_graph: nx.Graph, + rgps_in_graph: List[Union[Region, IdenticalRegions]], + grr_metric: str, + rgp_to_spot: Dict[Region, int]) -> None: + """ + Writes RGP cluster info to a TSV file using pandas. + + :param outfile: Name of the tsv file + :param grr_graph: The GRR graph. + :param rgps_in_graph: A dictionary mapping an RGP to a set of identical RGPs. + :param grr_metric: The GRR metric used for clustering. + :param rgp_to_spot: A dictionary mapping an RGP to its spot. 
+ :return: None + """ + + all_rgps_infos = [] + for rgp_in_graph in rgps_in_graph: + cluster = grr_graph.nodes[rgp_in_graph.ID][f'{grr_metric}_cluster'] + + identical_rgps = [rgp_in_graph] if isinstance(rgp_in_graph, Region) else rgp_in_graph.rgps + + all_rgps_infos += [{"RGPs": r.name, "cluster": cluster, + "spot_id": get_spot_id(r, rgp_to_spot)} for r in identical_rgps] + + df = pd.DataFrame(all_rgps_infos) + df.to_csv(outfile, sep='\t', index=False) + + +def cluster_rgp(pangenome, grr_cutoff: float, output: str, basename: str, + ignore_incomplete_rgp: bool, unmerge_identical_rgps: bool, grr_metric: str, + disable_bar: bool, graph_formats: Set[str]): + """ + Main function to cluster regions of genomic plasticity based on their GRR + + :param pangenome: pangenome object + :param grr_cutoff: GRR cutoff value for clustering + :param output: Directory where the output files will be saved + :param basename: Basename for the output files + :param ignore_incomplete_rgp: Whether to ignore incomplete RGPs located at a contig border + :param unmerge_identical_rgps: Whether to unmerge identical RGPs into separate nodes in the graph + :param grr_metric: GRR metric to use for clustering + :param disable_bar: Whether to disable the progress bar + :param graph_formats: Set of graph file formats to save the output + """ + + if pangenome.status["metadata"]["RGPs"] == "inFile": + need_metadata = True + logging.info('Some RGPs metadata have been found in pangenome, they will be included in rgp graph.') + else: + need_metadata = False + + # check statuses and load info + check_pangenome_info(pangenome, need_families=True, need_annotations=True, + disable_bar=disable_bar, need_rgp=True, need_spots=True, need_metadata=need_metadata, + metatype="RGPs") + + if pangenome.regions == 0: + raise Exception( + "The pangenome has no RGPs. 
The clustering of RGP is then not possible.") + + # add all rgp as node + if ignore_incomplete_rgp: + valid_rgps = [ + rgp for rgp in pangenome.regions if not rgp.is_contig_border] + + ignored_rgp_count = pangenome.number_of_rgp - len(valid_rgps) + total_rgp_count = pangenome.number_of_rgp + + logging.info( + f'Ignoring {ignored_rgp_count}/{total_rgp_count} ({100 * ignored_rgp_count / total_rgp_count:.2f}%) ' + 'RGPs that are located at a contig border and are likely incomplete.') + + if len(valid_rgps) == 0: + raise Exception( + "The pangenome has no complete RGPs. The clustering of RGP is then not possible.") + else: + valid_rgps = set(pangenome.regions) + + dereplicated_rgps = dereplicate_rgp(valid_rgps, disable_bar=disable_bar) + + grr_graph = nx.Graph() + grr_graph.add_nodes_from((rgp.ID for rgp in dereplicated_rgps)) + + # Get all pairs of RGP that share at least one family + + family2rgp = defaultdict(set) + for rgp in dereplicated_rgps: + for fam in rgp.families: + family2rgp[fam].add(rgp) + + rgp_pairs = set() + for rgps in family2rgp.values(): + rgp_pairs |= {tuple(sorted(rgp_pair)) for rgp_pair in combinations(rgps, 2)} + + pairs_count = len(rgp_pairs) + + logging.info( + f'Computing GRR metric for {pairs_count:,} pairs of RGP.') + + pairs_of_rgps_metrics = [] + + for rgp_a, rgp_b in rgp_pairs: + + pair_metrics = compute_rgp_metric(rgp_a, rgp_b, grr_cutoff, grr_metric) + + if pair_metrics: + pairs_of_rgps_metrics.append(pair_metrics) + + grr_graph.add_edges_from(pairs_of_rgps_metrics) + + identical_rgps_objects = [rgp for rgp in dereplicated_rgps if isinstance(rgp, IdenticalRegions)] + rgp_objects_in_graph = [rgp for rgp in dereplicated_rgps if isinstance(rgp, Region)] + + if unmerge_identical_rgps: + rgp_objects_in_graph += add_edges_to_identical_rgps(grr_graph, identical_rgps_objects) + + # cluster rgp based on grr value + logging.info( + f"Louvain_communities clustering of RGP based on {grr_metric} on {grr_graph}.") + + cluster_rgp_on_grr(grr_graph, 
grr_metric) + + rgp_to_spot = {region: int(spot.ID) + for spot in pangenome.spots for region in spot.regions} + + if not unmerge_identical_rgps: + logging.info("Add info on identical RGPs merged in the graph") + add_info_to_identical_rgps(grr_graph, identical_rgps_objects, rgp_to_spot) + + rgps_in_graph = rgp_objects_in_graph if unmerge_identical_rgps else dereplicated_rgps + + # add some attribute to the graph nodes. + logging.info("Add RGP information to the graph") + add_info_to_rgp_nodes(grr_graph, rgp_objects_in_graph, rgp_to_spot) + + if need_metadata: + add_rgp_metadata_to_graph(grr_graph, rgps_in_graph) + + if "gexf" in graph_formats: + # writting graph in gexf format + graph_file_name = os.path.join(output, f"{basename}.gexf") + logging.info(f"Writting graph in gexf format in {graph_file_name}.") + nx.readwrite.gexf.write_gexf(grr_graph, graph_file_name) + + if "graphml" in graph_formats: + graph_file_name = os.path.join(output, f"{basename}.graphml") + logging.info(f"Writting graph in graphml format in {graph_file_name}.") + nx.readwrite.graphml.write_graphml(grr_graph, graph_file_name) + + outfile = os.path.join(output, f"{basename}.tsv") + logging.info(f"Writting rgp clusters in tsv format in {outfile}") + + write_rgp_cluster_table( + outfile, grr_graph, rgps_in_graph, grr_metric, rgp_to_spot) + + +def launch(args: argparse.Namespace): + """ + Command launcher + + :param args: All arguments provided by user + """ + pangenome = Pangenome() + + mk_outdir(args.output, args.force) + + pangenome.add_file(args.pangenome) + + cluster_rgp(pangenome, grr_cutoff=args.grr_cutoff, output=args.output, + basename=args.basename, ignore_incomplete_rgp=args.ignore_incomplete_rgp, + unmerge_identical_rgps=args.no_identical_rgp_merging, + grr_metric=args.grr_metric, disable_bar=args.disable_prog_bar, graph_formats=args.graph_formats) + + +def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: + """ + Subparser to launch PPanGGOLiN in Command 
line + + :param sub_parser : Sub_parser for cluster_rgp command + + :return : Parser arguments for cluster_rgp command + """ + parser = sub_parser.add_parser( + "rgp_cluster", formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser_cluster_rgp(parser) + return parser + + +def parser_cluster_rgp(parser: argparse.ArgumentParser): + """ + Parser for specific argument of rgp command + + :param parser: Parser for cluster_rgp argument + """ + required = parser.add_argument_group(title="Required arguments", + description="One of the following arguments is required :") + required.add_argument('-p', '--pangenome', required=True, + type=Path, help="The pangenome .h5 file") + + optional = parser.add_argument_group(title="Optional arguments") + + optional.add_argument('--grr_cutoff', required=False, type=restricted_float, default=0.8, + help="Min gene repertoire relatedness metric used in the rgp clustering") + optional.add_argument('--grr_metric', required=False, type=str, default="incomplete_aware_grr", + help="The grr (Gene Repertoire Relatedness) is used to assess the similarity between two RGPs based on their gene families. " + "There are three different modes for calculating the grr value: 'min_grr', 'max_grr' or 'incomplete_aware_grr'." + " 'min_grr': Computes the number of gene families shared between the two RGPs and divides it by the smaller number of gene families among the two RGPs. " + " 'max_grr': Calculates the number of gene families shared between the two RGPs and divides it by the larger number of gene families among the two RGPs. " + " 'incomplete_aware_grr' (default): If at least one RGP is considered incomplete, which occurs when it is located at the border of a contig, " + "the 'min_grr' mode is used. 
Otherwise, the 'max_grr' mode is applied.", + choices=['incomplete_aware_grr', "min_grr", "max_grr"]) + + optional.add_argument('--ignore_incomplete_rgp', required=False, action="store_true", + help="Do not cluster RGPs located on a contig border which are likely incomplete.") + + optional.add_argument('--no_identical_rgp_merging', required=False, action="store_true", + help="Do not merge in one node identical RGP (i.e. having the same family content) before clustering.") + + optional.add_argument("--basename", required=False, + default="rgp_cluster", help="basename for the output file") + + optional.add_argument('-o', '--output', required=False, type=Path, + default="rgp_clustering", help="Output directory") + + optional.add_argument('--graph_formats', required=False, type=str, choices=['gexf', "graphml"], nargs="+", + default=['gexf'], help="Format of the output graph.") diff --git a/ppanggolin/RGP/spot.py b/ppanggolin/RGP/spot.py index 617c6e64..00b6e7a6 100644 --- a/ppanggolin/RGP/spot.py +++ b/ppanggolin/RGP/spot.py @@ -209,7 +209,8 @@ def predict_hotspots(pangenome: Pangenome, output: Path, spot_graph: bool = Fals spots.append(curr_spot) for node in comp: - curr_spot.add_regions(graph_spot.nodes[node]["rgp"]) + for region in graph_spot.nodes[node]["rgp"]: + curr_spot.add(region) if spot_graph: graph_spot.nodes[node]["spot_id"] = str(curr_spot) @@ -220,8 +221,8 @@ def predict_hotspots(pangenome: Pangenome, output: Path, spot_graph: bool = Fals logging.getLogger("PPanGGOLiN").warning("No spots were detected.") else: logging.getLogger("PPanGGOLiN").info(f"{len(spots)} spots were detected") - - pangenome.add_spots(spots) + for spot in spots: + pangenome.add_spot(spot) pangenome.status["spots"] = "Computed" pangenome.parameters["spot"] = {} pangenome.parameters["spot"]["set_size"] = set_size @@ -270,8 +271,9 @@ def parser_spot(parser: argparse.ArgumentParser): required.add_argument('-p', '--pangenome', required=False, type=Path, help="The pangenome .h5 file") 
optional = parser.add_argument_group(title="Optional arguments") optional.add_argument('-o', '--output', required=False, type=Path, - default=Path(f"ppanggolin_output{time.strftime('DATE%Y-%m-%d_HOUR%H.%M.%S', time.localtime())}" - f"_PID{str(os.getpid())}"), + default=Path( + f"ppanggolin_output{time.strftime('DATE%Y-%m-%d_HOUR%H.%M.%S', time.localtime())}" + f"_PID{str(os.getpid())}"), help="Output directory") optional.add_argument("--spot_graph", required=False, action="store_true", help="Writes a graph of pairs of blocks of single copy markers flanking RGPs," diff --git a/ppanggolin/__init__.py b/ppanggolin/__init__.py index 8c047754..f8626afa 100755 --- a/ppanggolin/__init__.py +++ b/ppanggolin/__init__.py @@ -37,7 +37,6 @@ "module": ppanggolin.mod.subparser, "context": ppanggolin.context.subparser, "projection":ppanggolin.projection.subparser, - # "info":ppanggolin.info.subparser, "default_config":ppanggolin.utility.default_config.subparser - # "info":ppanggolin.info.subparser, "default_config":ppanggolin.utility.default_config.subparser + "rgp_cluster":ppanggolin.RGP.rgp_cluster.subparser, "metadata": ppanggolin.meta.subparser } diff --git a/ppanggolin/align/alignOnPang.py b/ppanggolin/align/alignOnPang.py index e11ea330..2382b461 100644 --- a/ppanggolin/align/alignOnPang.py +++ b/ppanggolin/align/alignOnPang.py @@ -306,7 +306,7 @@ def get_fam_to_spot(pangenome: Pangenome, multigenics: Set[GeneFamily]) \ fams = set() fams_border = set() for rgp in spot.regions: - fams |= rgp.families + fams |= set(rgp.families) fams_border |= set([gene.family for border in # Set of families in border of spot rgp.get_bordering_genes(pangenome.parameters["spot"]["set_size"], multigenics) for gene in border]) diff --git a/ppanggolin/annotate/annotate.py b/ppanggolin/annotate/annotate.py index 561eb451..6b45efc3 100644 --- a/ppanggolin/annotate/annotate.py +++ b/ppanggolin/annotate/annotate.py @@ -18,7 +18,7 @@ from ppanggolin.annotate.synta import annotate_organism, read_fasta, 
get_dna_sequence from ppanggolin.pangenome import Pangenome from ppanggolin.genome import Organism, Gene, RNA, Contig -from ppanggolin.utils import read_compressed_or_not, mk_file_name, detect_filetype +from ppanggolin.utils import read_compressed_or_not, mk_file_name, detect_filetype, check_input_files from ppanggolin.formats import write_pangenome @@ -33,6 +33,12 @@ def check_annotate_args(args): raise Exception("You must provide at least a file with the --fasta option to annotate from sequences, " "or a file with the --gff option to load annotations from.") + + if hasattr(args, "fasta") and args.fasta is not None: + check_input_files(args.fasta, True) + + if hasattr(args, "anno") and args.anno is not None: + check_input_files(args.anno, True) def create_gene(org: Organism, contig: Contig, gene_counter: int, rna_counter: int, gene_id: str, dbxref: set, start: int, stop: int, strand: str, gene_type: str, position: int = None, gene_name: str = "", @@ -77,7 +83,7 @@ def create_gene(org: Organism, contig: Contig, gene_counter: int, rna_counter: i new_gene.fill_annotations(start=start, stop=stop, strand=strand, gene_type=gene_type, name=gene_name, position=position, product=product, local_identifier=gene_id, genetic_code=genetic_code) - contig.add_gene(new_gene) + contig[new_gene.start] = new_gene else: # if not CDS, it is RNA new_gene = RNA(org.name + "_RNA_" + str(rna_counter).zfill(4)) new_gene.fill_annotations(start=start, stop=stop, strand=strand, gene_type=gene_type, name=gene_name, @@ -121,18 +127,22 @@ def read_org_gbff(organism: str, gbff_file_path: Path, circular_contigs: list, p if line.startswith('VERSION'): contig_id = line[12:].strip() if contig_id != "": - if contig_id in circular_contigs: - is_circ = True - contig = org.get_contig(contig_id, is_circ) + try: + contig = org.get(contig_id) + except KeyError: + contig = Contig(contig_id, True if contig_id in circular_contigs else False) + org.add(contig) set_contig = True line = lines.pop() if not 
set_contig: # if no contig ids were filled after VERSION, we use what was found in LOCUS for the contig ID. # Should be unique in a dataset, but if there's an update the contig ID # might still be the same even though it should not(?) - if contig_locus_id in circular_contigs: - is_circ = True - contig = org.get_contig(contig_locus_id, is_circ) + try: + contig = org.get(contig_locus_id) + except KeyError: + contig = Contig(contig_locus_id, True if contig_locus_id in circular_contigs else False) + org.add(contig) # start of the feature object. dbxref = set() gene_name = "" @@ -143,15 +153,15 @@ def read_org_gbff(organism: str, gbff_file_path: Path, circular_contigs: list, p genetic_code = "" useful_info = False start = None - end = None + stop = None strand = None line = lines.pop() while not line.startswith("ORIGIN"): curr_type = line[5:21].strip() if curr_type != "": if useful_info: - create_gene(org, contig, gene_counter, rna_counter, locus_tag, dbxref, start, end, strand, obj_type, - len(contig.genes), gene_name, product, genetic_code, protein_id) + create_gene(org, contig, gene_counter, rna_counter, locus_tag, dbxref, start, stop, strand, obj_type, + contig.number_of_genes, gene_name, product, genetic_code, protein_id) if obj_type == "CDS": gene_counter += 1 else: @@ -166,18 +176,18 @@ def read_org_gbff(organism: str, gbff_file_path: Path, circular_contigs: list, p useful_info = True if line[21:].startswith('complement('): strand = "-" - start, end = line[32:].strip().replace( - ')', '').split("..") + start, stop = line[32:].strip().replace(')', '').split("..") else: strand = "+" - start, end = line[21:].strip().split('..') - if '>' in start or '<' in start or '>' in end or '<' in end: + start, stop = line[21:].strip().split('..') + if '>' in start or '<' in start or '>' in stop or '<' in stop: if not pseudo: # pseudogene likely useful_info = False else: start = start.replace('>', '').replace('<', '') - end = end.replace('>', '').replace('<', '') + stop = 
stop.replace('>', '').replace('<', '') + start, stop = map(int, [start, stop]) except ValueError: pass # don't know what to do with that, ignoring for now. @@ -210,8 +220,8 @@ def read_org_gbff(organism: str, gbff_file_path: Path, circular_contigs: list, p line = lines.pop() # end of contig if useful_info: # saving the last element... - create_gene(org, contig, gene_counter, rna_counter, locus_tag, dbxref, start, end, strand, obj_type, - len(contig.genes), gene_name, product, genetic_code, protein_id) + create_gene(org, contig, gene_counter, rna_counter, locus_tag, dbxref, start, stop, strand, obj_type, + contig.number_of_genes, gene_name, product, genetic_code, protein_id) if obj_type == "CDS": gene_counter += 1 else: @@ -226,7 +236,7 @@ def read_org_gbff(organism: str, gbff_file_path: Path, circular_contigs: list, p line = lines.pop() # get each gene's sequence. for gene in contig.genes: - gene.add_dna(get_dna_sequence(sequence, gene)) + gene.add_sequence(get_dna_sequence(sequence, gene)) return org, True @@ -292,7 +302,12 @@ def get_id_attribute(attributes_dict: dict) -> str: has_fasta = True elif line.startswith('sequence-region', 2, 17): fields = [el.strip() for el in line.split()] - contig = org.get_contig(fields[1], True if fields[1] in circular_contigs else False) + try: + contig = org.get(fields[1]) + except KeyError: + contig = Contig(fields[1], True if fields[1] in circular_contigs else False) + org.add(contig) + continue elif line.startswith('#'): # comment lines to be ignores by parsers continue @@ -331,18 +346,21 @@ def get_id_attribute(attributes_dict: dict) -> str: genetic_code = 11 if contig is None or contig.name != fields_gff[gff_seqname]: # get the current contig - contig = org.get_contig(fields_gff[gff_seqname], - True if fields_gff[gff_seqname] in circular_contigs else False) + try: + contig = org.get(fields_gff[gff_seqname]) + except KeyError: + contig = Contig(fields_gff[gff_seqname], + True if fields_gff[gff_seqname] in circular_contigs 
else False) + org.add(contig) if fields_gff[gff_type] == "CDS" and (not pseudogene or (pseudogene and pseudo)): gene = Gene(org.name + "_CDS_" + str(gene_counter).zfill(4)) # here contig is filled in order, so position is the number of genes already stored in the contig. gene.fill_annotations(start=int(fields_gff[gff_start]), stop=int(fields_gff[gff_end]), strand=fields_gff[gff_strand], gene_type=fields_gff[gff_type], name=name, - position=len(contig.genes), product=product, local_identifier=gene_id, + position=contig.number_of_genes, product=product, local_identifier=gene_id, genetic_code=genetic_code) gene.fill_parents(org, contig) - contig.add_gene(gene) gene_counter += 1 elif "RNA" in fields_gff[gff_type]: rna = RNA(org.name + "_CDS_" + str(rna_counter).zfill(4)) @@ -350,7 +368,6 @@ def get_id_attribute(attributes_dict: dict) -> str: strand=fields_gff[gff_strand], gene_type=fields_gff[gff_type], name=name, product=product, local_identifier=gene_id) rna.fill_parents(org, contig) - contig.add_rna(rna) rna_counter += 1 # GET THE FASTA SEQUENCES OF THE GENES @@ -358,9 +375,9 @@ def get_id_attribute(attributes_dict: dict) -> str: contig_sequences, _ = read_fasta(org, fasta_string.split('\n')) # _ is total contig length for contig in org.contigs: for gene in contig.genes: - gene.add_dna(get_dna_sequence(contig_sequences[contig.name], gene)) + gene.add_sequence(get_dna_sequence(contig_sequences[contig.name], gene)) for rna in contig.RNAs: - rna.add_dna(get_dna_sequence(contig_sequences[contig.name], rna)) + rna.add_sequence(get_dna_sequence(contig_sequences[contig.name], rna)) return org, has_fasta @@ -445,8 +462,6 @@ def read_annotations(pangenome: Pangenome, organisms_file: Path, cpu: int = 1, p args = [] for line in read_compressed_or_not(organisms_file): elements = [el.strip() for el in line.split("\t")] - if len(elements) <= 1: - raise Exception(f"No tabulation separator found in given --fasta file: '{organisms_file}'") org_path = Path(elements[1]) if not 
org_path.exists(): # Check tsv sanity test if it's not one it's the other org_path = organisms_file.parent.joinpath(org_path) @@ -496,17 +511,17 @@ def get_gene_sequences_from_fastas(pangenome, fasta_file): with read_compressed_or_not(elements[1]) as currFastaFile: fasta_dict[org], _ = read_fasta(org, currFastaFile) if set(pangenome.organisms) > set(fasta_dict.keys()): - missing = len(pangenome.organisms) - len(set(pangenome.organisms) & set(fasta_dict.keys())) + missing = pangenome.number_of_organisms() - len(set(pangenome.organisms) & set(fasta_dict.keys())) raise Exception(f"Not all of your pangenome organisms are present within the provided fasta file. " - f"{missing} are missing (out of {len(pangenome.organisms)}).") + f"{missing} are missing (out of {pangenome.number_of_organisms()}).") for org in pangenome.organisms: for contig in org.contigs: try: for gene in contig.genes: - gene.add_dna(get_dna_sequence(fasta_dict[org][contig.name], gene)) + gene.add_sequence(get_dna_sequence(fasta_dict[org][contig.name], gene)) for rna in contig.RNAs: - rna.add_dna(get_dna_sequence(fasta_dict[org][contig.name], rna)) + rna.add_sequence(get_dna_sequence(fasta_dict[org][contig.name], rna)) except KeyError: msg = f"Fasta file for organism {org.name} did not have the contig {contig.name} " \ f"that was read from the annotation file. " @@ -527,7 +542,7 @@ def launch_annotate_organism(pack: tuple) -> Organism: def annotate_pangenome(pangenome: Pangenome, fasta_list: Path, tmpdir: str, cpu: int = 1, translation_table: int = 11, - kingdom: str = "bacteria", norna: bool = False, overlap: bool = True, procedure: str = None, + kingdom: str = "bacteria", norna: bool = False, allow_overlap: bool = False, procedure: str = None, disable_bar: bool = False): """ Main function to annotate a pangenome @@ -539,7 +554,7 @@ def annotate_pangenome(pangenome: Pangenome, fasta_list: Path, tmpdir: str, cpu: :param translation_table: Translation table (genetic code) to use. 
:param kingdom: Kingdom to which the prokaryota belongs to, to know which models to use for rRNA annotation. :param norna: Use to avoid annotating RNA features. - :param overlap: Use to not remove genes overlapping with RNA features + :param allow_overlap: Use to not remove genes overlapping with RNA features :param procedure: prodigal procedure used :param disable_bar: Disable the progresse bar """ @@ -548,16 +563,20 @@ def annotate_pangenome(pangenome: Pangenome, fasta_list: Path, tmpdir: str, cpu: arguments = [] # Argument given to annotate organism in same order than prototype for line in read_compressed_or_not(fasta_list): + elements = [el.strip() for el in line.split("\t")] - if len(elements) <= 1: # TODO remove ? Already tested by check TSV sanity - raise Exception("No tabulation separator found in organisms file") org_path = Path(elements[1]) + if not org_path.exists(): # Check tsv sanity test if it's not one it's the other org_path = fasta_list.parent.joinpath(org_path) + arguments.append((elements[0], org_path, elements[2:], tmpdir, translation_table, - norna, kingdom, overlap, procedure)) + norna, kingdom, allow_overlap, procedure)) + if len(arguments) == 0: raise Exception("There are no genomes in the provided file") + + logging.getLogger("PPanGGOLiN").info(f"Annotating {len(arguments)} genomes using {cpu} cpus...") with get_context('fork').Pool(processes=cpu) as p: for organism in tqdm(p.imap_unordered(launch_annotate_organism, arguments), unit="genome", @@ -570,14 +589,14 @@ def annotate_pangenome(pangenome: Pangenome, fasta_list: Path, tmpdir: str, cpu: pangenome.status["genomesAnnotated"] = "Computed" # the pangenome is now annotated. pangenome.status["geneSequences"] = "Computed" # the gene objects have their respective gene sequences. 
pangenome.parameters["annotate"] = {} - pangenome.parameters["annotate"]["allow_overlap"] = overlap pangenome.parameters["annotate"]["norna"] = norna pangenome.parameters["annotate"]["kingdom"] = kingdom pangenome.parameters["annotate"]["translation_table"] = translation_table pangenome.parameters["annotate"]["prodigal_procedure"] = None if procedure is None else procedure + pangenome.parameters["annotate"]["allow_overlap"] = allow_overlap + pangenome.parameters["annotate"]["norna"] = norna pangenome.parameters["annotate"]["# read_annotations_from_file"] = False - def launch(args: argparse.Namespace): """ Command launcher @@ -590,7 +609,7 @@ def launch(args: argparse.Namespace): if args.fasta is not None and args.anno is None: annotate_pangenome(pangenome, args.fasta, tmpdir=args.tmpdir, cpu=args.cpu, procedure=args.prodigal_procedure, translation_table=args.translation_table, kingdom=args.kingdom, norna=args.norna, - overlap=args.allow_overlap, disable_bar=args.disable_prog_bar) + allow_overlap=args.allow_overlap, disable_bar=args.disable_prog_bar) elif args.anno is not None: read_annotations(pangenome, args.anno, cpu=args.cpu, pseudo=args.use_pseudo, disable_bar=args.disable_prog_bar) if pangenome.status["geneSequences"] == "No": diff --git a/ppanggolin/annotate/synta.py b/ppanggolin/annotate/synta.py index 42953743..afad1cca 100644 --- a/ppanggolin/annotate/synta.py +++ b/ppanggolin/annotate/synta.py @@ -13,7 +13,7 @@ from pathlib import Path # local libraries -from ppanggolin.genome import Organism, Gene, RNA +from ppanggolin.genome import Organism, Gene, RNA, Contig from ppanggolin.utils import is_compressed, read_compressed_or_not @@ -143,16 +143,11 @@ def launch_infernal(fna_file: str, org: Organism, tmpdir: str, kingdom: str = " c += 1 line_data = line.split() strand = line_data[9] - if strand == "-": - start = line_data[8] - stop = line_data[7] - else: - start, stop = map(int, (line_data[7], line_data[8])) + start, stop = map(int, (line_data[8], 
line_data[7]) if strand == "-" else (line_data[7], line_data[8])) gene = RNA(rna_id=locustag + "_rRNA_" + str(c).zfill(3)) gene.fill_annotations(start=start, stop=stop, strand=strand, gene_type="rRNA", product=" ".join(line_data[17:])) gene_objs[line_data[2]].add(gene) - return gene_objs @@ -175,7 +170,11 @@ def read_fasta(org: Organism, fna_file: Union[TextIOWrapper, list]) -> (dict, in contigs[contig.name] = contig_seq.upper() all_contig_len += len(contig_seq) contig_seq = "" - contig = org.get_contig(line.split()[0][1:]) + try: + contig = org.get(line.split()[0][1:]) + except KeyError: + contig = Contig(line.split()[0][1:]) + org.add(contig) else: contig_seq += line.strip() if len(contig_seq) >= 1: # processing the last contig @@ -242,12 +241,12 @@ def syntaxic_annotation(org: Organism, fasta_file: TextIOWrapper, tmpdir: str, n return genes -def overlap_filter(all_genes: defaultdict, overlap: bool = True) -> defaultdict: +def overlap_filter(all_genes: defaultdict, allow_overlap: bool = False) -> defaultdict: """ Removes the CDS that overlap with RNA genes. 
:param all_genes: Dictionary with complete list of genes - :param overlap: Allow to filter overlap + :param allow_overlap: Use to not remove genes overlapping with RNA features :return: Dictionary with genes filtered """ @@ -256,7 +255,7 @@ def overlap_filter(all_genes: defaultdict, overlap: bool = True) -> defaultdict: for key, genes in all_genes.items(): tmp_genes = sorted(genes, key=lambda x: x.start) rm_genes = set() - if overlap: + if not allow_overlap: for i, gene_i in enumerate(tmp_genes): if i + 1 < len(tmp_genes): gene_j = tmp_genes[i + 1] @@ -293,7 +292,7 @@ def get_dna_sequence(contig_seq: str, gene: Gene) -> str: def annotate_organism(org_name: str, file_name: Path, circular_contigs, tmpdir: str, code: int = 11, norna: bool = False, kingdom: str = "bacteria", - overlap: bool = True, procedure: str = None) -> Organism: + allow_overlap: bool = False, procedure: str = None) -> Organism: """ Function to annotate a single organism @@ -304,7 +303,7 @@ def annotate_organism(org_name: str, file_name: Path, circular_contigs, tmpdir: :param kingdom: Kingdom to which the prokaryota belongs to, to know which models to use for rRNA annotation. :param norna: Use to avoid annotating RNA features. 
:param tmpdir: Path to temporary directory - :param overlap: Use to not remove genes overlapping with RNA features + :param allow_overlap: Use to not remove genes overlapping with RNA features :param procedure: prodigal procedure used :return: Complete organism object for pangenome @@ -323,17 +322,19 @@ def annotate_organism(org_name: str, file_name: Path, circular_contigs, tmpdir: else: procedure = "single" genes = syntaxic_annotation(org, fasta_file, tmpdir, norna, kingdom, code, procedure) - genes = overlap_filter(genes, overlap) + genes = overlap_filter(genes, allow_overlap=allow_overlap) for contig_name, genes in genes.items(): - contig = org.get_contig(contig_name) - if contig.name in circular_contigs: - contig.is_circular = True + try: + contig = org.get(contig_name) + except KeyError: + contig = Contig(contig_name, True if contig_name in circular_contigs else False) + org.add(contig) for gene in genes: - gene.add_dna(get_dna_sequence(contig_sequences[contig.name], gene)) + gene.add_sequence(get_dna_sequence(contig_sequences[contig.name], gene)) gene.fill_parents(org, contig) if isinstance(gene, Gene): - contig.add_gene(gene) + contig[gene.start] = gene elif isinstance(gene, RNA): contig.add_rna(gene) return org diff --git a/ppanggolin/cluster/cluster.py b/ppanggolin/cluster/cluster.py index 649ca6f1..f8f2490c 100644 --- a/ppanggolin/cluster/cluster.py +++ b/ppanggolin/cluster/cluster.py @@ -19,6 +19,7 @@ # local libraries from ppanggolin.pangenome import Pangenome from ppanggolin.genome import Gene +from ppanggolin.geneFamily import GeneFamily from ppanggolin.utils import read_compressed_or_not, restricted_float from ppanggolin.formats.writeBinaries import write_pangenome, erase_pangenome from ppanggolin.formats.readBinaries import check_pangenome_info, get_gene_sequences_from_file @@ -244,8 +245,9 @@ def read_fam2seq(pangenome: Pangenome, fam_to_seq: Dict[str, str]): """ logging.getLogger("PPanGGOLiN").info("Adding protein sequences to the gene families") 
for family, protein in fam_to_seq.items(): - fam = pangenome.add_gene_family(family) + fam = GeneFamily(pangenome.max_fam_id, family) fam.add_sequence(protein) + pangenome.add_gene_family(fam) def read_gene2fam(pangenome: Pangenome, gene_to_fam: dict, disable_bar: bool = False): @@ -259,19 +261,23 @@ def read_gene2fam(pangenome: Pangenome, gene_to_fam: dict, disable_bar: bool = F logging.getLogger("PPanGGOLiN").info(f"Adding {len(gene_to_fam)} genes to the gene families") link = True if pangenome.status["genomesAnnotated"] in ["Computed", "Loaded"] else False - if link and len(gene_to_fam) != len(pangenome.genes): # then maybe there are genes with identical IDs + if link and len(gene_to_fam) != pangenome.number_of_genes: # then maybe there are genes with identical IDs raise Exception("Something unexpected happened during clustering (have less genes clustered than genes " "in the pangenome). A probable reason is that two genes in two different organisms have " "the same IDs; If you are sure that all of your genes have non identical IDs, please post an " "issue at https://github.com/labgem/PPanGGOLiN/") for gene, (family, is_frag) in tqdm(gene_to_fam.items(), unit="gene", total=len(gene_to_fam), disable=disable_bar): - fam = pangenome.add_gene_family(family) + try: + fam = pangenome.get_gene_family(family) + except KeyError: # Family not found so create and add + fam = GeneFamily(pangenome.max_fam_id, family) + pangenome.add_gene_family(fam) if link: # doing the linking if the annotations are loaded. 
gene_obj = pangenome.get_gene(gene) else: gene_obj = Gene(gene) gene_obj.is_fragment = is_frag - fam.add_gene(gene_obj) + fam.add(gene_obj) def clustering(pangenome: Pangenome, tmpdir: Path, cpu: int = 1, defrag: bool = True, code: int = 11, @@ -369,7 +375,9 @@ def infer_singletons(pangenome: Pangenome): singleton_counter = 0 for gene in pangenome.genes: if gene.family is None: - pangenome.add_gene_family(gene.ID).add_gene(gene) + fam = GeneFamily(family_id=pangenome.max_fam_id, name=gene.ID) + fam.add(gene) + pangenome.add_gene_family(fam) singleton_counter += 1 logging.getLogger("PPanGGOLiN").info(f"Inferred {singleton_counter} singleton families") @@ -411,16 +419,20 @@ def read_clustering(pangenome: Pangenome, families_tsv_file: Path, infer_singlet gene_obj = local_dict.get(gene_id) if gene_obj is not None: nb_gene_with_fam += 1 - fam = pangenome.add_gene_family(fam_id) + try: + fam = pangenome.get_gene_family(fam_id) + except KeyError: # Family not found so create and add + fam = GeneFamily(pangenome.max_fam_id, fam_id) + pangenome.add_gene_family(fam) gene_obj.is_fragment = True if is_frag == "F" else False # F for Fragment - fam.add_gene(gene_obj) + fam.add(gene_obj) if is_frag == "F": frag = True except Exception: raise Exception(f"line {line_counter} of the file '{families_tsv_file.name}' raised an error.") bar.close() families_tsv_file.close() - if nb_gene_with_fam < len(pangenome.genes): # not all genes have an associated cluster + if nb_gene_with_fam < pangenome.number_of_genes: # not all genes have an associated cluster if nb_gene_with_fam == 0: raise Exception("No gene ID in the cluster file matched any gene ID from the annotation step." 
" Please ensure that the annotations that you loaded previously and the clustering results " @@ -430,7 +442,7 @@ def read_clustering(pangenome: Pangenome, families_tsv_file: Path, infer_singlet if infer_singleton: infer_singletons(pangenome) else: - raise Exception(f"Some genes ({len(pangenome.genes) - nb_gene_with_fam}) did not have an associated " + raise Exception(f"Some genes ({pangenome.number_of_genes - nb_gene_with_fam}) did not have an associated " f"cluster. Either change your cluster file so that each gene has a cluster, " f"or use the --infer_singletons option to infer a cluster for each non-clustered gene.") pangenome.status["genesClustered"] = "Computed" diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index f5161f99..339c271e 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -7,6 +7,7 @@ import tempfile import time from pathlib import Path +from typing import Set # installed libraries from tqdm import tqdm @@ -17,6 +18,7 @@ from ppanggolin.formats import check_pangenome_info from ppanggolin.genome import Gene, Contig from ppanggolin.utils import mk_outdir, restricted_float, add_gene, connected_components, create_tmpdir +from ppanggolin.geneFamily import GeneFamily from ppanggolin.pangenome import Pangenome from ppanggolin.align.alignOnPang import project_and_write_partition, get_input_seq_to_family_with_rep, get_input_seq_to_family_with_all from ppanggolin.region import GeneContext @@ -93,7 +95,7 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: Path, tmpdir: families = set() for gene_context in common_components: - families |= gene_context.families + families |= set(gene_context.families) if len(families) != 0: export_to_dataframe(families, common_components, fam_2_seq, output) @@ -117,7 +119,7 @@ def compute_gene_context_graph(families: dict, t: int = 4, disable_bar: bool = F g = nx.Graph() for family in tqdm(families.values(), 
unit="families", disable=disable_bar): for gene in family.genes: - contig = gene.contig.genes + contig = list(gene.contig.genes) pos_left, in_context_left, pos_right, in_context_right = extract_gene_context(gene, contig, families, t) if in_context_left or in_context_right: for env_gene in contig[pos_left:pos_right + 1]: @@ -214,7 +216,7 @@ def fam2seq(seq_to_pan: dict) -> dict: return fam_2_seq -def export_to_dataframe(families: set, gene_contexts: set, fam_to_seq: dict, output: str): +def export_to_dataframe(families: Set[GeneFamily], gene_contexts: Set[GeneContext], fam_to_seq: dict, output: str): """ Export the results into dataFrame :param families: Families related to the connected components @@ -230,10 +232,10 @@ def export_to_dataframe(families: set, gene_contexts: set, fam_to_seq: dict, out for family in gene_context.families: line = [gene_context.ID] if fam_to_seq is None or fam_to_seq.get(family.ID) is None: - line += [family.name, None, len(family.organisms), family.named_partition] + line += [family.name, None, family.number_of_organisms, family.named_partition] else: line += [family.name, ','.join(fam_to_seq.get(family.ID)), - len(family.organisms), family.named_partition] + family.number_of_organisms, family.named_partition] lines.append(line) df = pd.DataFrame(lines, columns=["GeneContext ID", "Gene family name", "Sequence ID", "Nb Genomes", "Partition"] diff --git a/ppanggolin/edge.py b/ppanggolin/edge.py index eea36c73..903f8aa6 100644 --- a/ppanggolin/edge.py +++ b/ppanggolin/edge.py @@ -3,7 +3,7 @@ # default libraries from collections import defaultdict -from typing import Dict, List, Tuple +from typing import Dict, Generator, List, Tuple from ppanggolin.genome import Gene, Organism @@ -11,50 +11,96 @@ class Edge: """The Edge class represents an edge between two gene families in the pangenome graph. It is associated with all the organisms in which the neighborship is found, and all the involved genes as well. 
+ Methods: + - get_org_dict: Returns a dictionary with organisms as keys and an iterable of the pairs in genes as values. + - gene_pairs: Returns a list of all the gene pairs in the Edge. + - add_genes: Adds genes to the edge. They are supposed to be in the same organism. - :param source_gene: a first gene to initialize the edge - :param target_gene: a second gene to initialize the edge + Fields: + - source: A GeneFamily object representing the source gene family of the edge. + - target: A GeneFamily object representing the target gene family of the edge. + - organisms: A defaultdict object representing the organisms in which the edge is found and the pairs of genes involved. """ def __init__(self, source_gene: Gene, target_gene: Gene): + """Constructor method + + :param source_gene: First gene to initialize the edge + :param target_gene: Second gene to initialize the edge + """ + # TODO try to change for gene family ? if source_gene.family is None: - raise Exception(f"You cannot create a graph without gene families. " - f"gene {source_gene.ID} did not have a gene family.") + raise AttributeError(f"You cannot create a graph without gene families. " + f"gene {source_gene.ID} did not have a gene family.") if target_gene.family is None: - raise Exception(f"You cannot create a graph without gene families. " - f"gene {target_gene.ID} did not have a gene family.") + raise AttributeError(f"You cannot create a graph without gene families. 
" + f"gene {target_gene.ID} did not have a gene family.") self.source = source_gene.family self.target = target_gene.family - self.source._edges[self.target] = self - self.target._edges[self.source] = self - self.organisms = defaultdict(list) + self.source.set_edge(self.target, self) + self.target.set_edge(self.source, self) + self._organisms = defaultdict(list) self.add_genes(source_gene, target_gene) - def get_org_dict(self) -> Dict[Organism, List[Tuple[Gene, Gene]]]: - """ Create a dictionnary of the Organisms in which the edge is found + @property + def organisms(self) -> Generator[Organism, None, None]: + """Get all the organisms belonging to the edge - :return: Dictionary with organisms as key and an iterable of the pairs of genes as value + :return: Generator with organisms as the key and an iterable of the gene pairs as value """ - return self.organisms + for organism in self._organisms.keys(): + yield organism + + @property + def number_of_organisms(self) -> int: + """Get the number of organisms in the edge + + :return: Number of organisms + """ + return len(self._organisms) + + def get_organism_genes_pairs(self, organism: Organism) -> List[Tuple[Gene, Gene]]: + """Get the gene pair corresponding to the given organism + + :param organism: Wanted organism + + :return: Pair of genes in the edge corresponding to the given organism + """ + return self._organisms[organism] + + def get_organisms_dict(self) -> Dict[Organism, List[Tuple[Gene, Gene]]]: + """Get all the organisms with their corresponding pair of genes in the edge + + :return: Dictionary with the organism as the key and list of gene pairs as value + """ + return self._organisms @property def gene_pairs(self) -> List[Tuple[Gene, Gene]]: - """ Get list of all the gene pairs of the Edge + """ Get the list of all the gene pairs in the Edge - :return: A list of all the gene pairs of the Edge + :return: A list of all the gene pairs in the Edge """ - return [gene_pair for gene_list in 
self.organisms.values() for gene_pair in gene_list] + return [gene_pair for gene_list in self.get_organisms_dict().values() for gene_pair in gene_list] def add_genes(self, source_gene: Gene, target_gene: Gene): - """Adds genes to the edge. They are supposed to be on the same organism. + """ + Adds genes to the edge. + They are supposed to be in the same organism. - :param source_gene: a source gene to add to the edge - :param target_gene: a target gene to add to the edge + :param source_gene: Gene corresponding to the source of the edge + :param target_gene: Gene corresponding to the target of the edge - :raises Exception: If the genes are not on the same organism. + :raises TypeError: If the genes are not with Gene type + :raises ValueError: If genes are not associated with an organism + :raises Exception: If the genes are not in the same organism. """ - org = source_gene.organism - if org != target_gene.organism: + if not isinstance(source_gene, Gene) or not isinstance(target_gene, Gene): + raise TypeError(f"Genes are expected to be added to edge. " + f"Given type for source: {type(source_gene)} and target: {type(target_gene)}") + if source_gene.organism is None or target_gene.organism is None: + raise ValueError("Genes are not associated to organism. It's needed to create add genes to edge") + if source_gene.organism != target_gene.organism: raise Exception(f"You tried to create an edge between two genes that are not even in the same organism ! 
" f"(genes are '{source_gene.ID}' and '{target_gene.ID}')") - self.organisms[org].append((source_gene, target_gene)) + self._organisms[source_gene.organism].append((source_gene, target_gene)) diff --git a/ppanggolin/figures/draw_spot.py b/ppanggolin/figures/draw_spot.py index 6029501c..1c926682 100644 --- a/ppanggolin/figures/draw_spot.py +++ b/ppanggolin/figures/draw_spot.py @@ -174,9 +174,9 @@ def subgraph(spot: Spot, outname: str, with_border: bool = True, set_size: int = minpos = min([gene.position for border in borders for gene in border]) maxpos = max([gene.position for border in borders for gene in border]) else: - minpos = rgp.start_gene.position - maxpos = rgp.stop_gene.position - gene_list = rgp.contig.genes[minpos:maxpos + 1] + minpos = rgp.starter.position + maxpos = rgp.stopper.position + gene_list = rgp.contig.get_genes(minpos, maxpos + 1) prev = None for gene in gene_list: g.add_node(gene.family.name, partition=gene.family.named_partition) @@ -201,10 +201,10 @@ def subgraph(spot: Spot, outname: str, with_border: bool = True, set_size: int = try: g[gene.family.name][prev]["rgp"].add(rgp) except KeyError: - g[gene.family.name][prev]["rgp"] = set(rgp) + g[gene.family.name][prev]["rgp"] = {rgp} prev = gene.family.name for node1, node2 in g.edges: - g[node1][node2]["weight"] = len(g[node1][node2]["rgp"]) / len(spot.regions) + g[node1][node2]["weight"] = len(g[node1][node2]["rgp"]) / len(spot) del g[node1][node2]["rgp"] for node in g.nodes: if "name" in g.nodes[node]: @@ -592,7 +592,7 @@ def draw_selected_spots(selected_spots: Union[List[Spot], Set[Spot]], pangenome: borders = rgp.get_bordering_genes(set_size, multigenics) minpos = min([gene.position for border in borders for gene in border]) maxpos = max([gene.position for border in borders for gene in border]) - gene_list = rgp.contig.genes[minpos:maxpos + 1] + gene_list = rgp.contig.get_genes(minpos, maxpos + 1) minstart = min([gene.start for border in borders for gene in border]) maxstop = 
max([gene.stop for border in borders for gene in border]) rnas_toadd = set() @@ -615,7 +615,7 @@ def draw_selected_spots(selected_spots: Union[List[Spot], Set[Spot]], pangenome: uniq_gene_lists = [] ordered_counts = [] for genelist in gene_lists: - curr_genelist_count = count_uniq.get(genelist[2], None) + curr_genelist_count = count_uniq.get(genelist[2]) if curr_genelist_count is not None: uniq_gene_lists.append(genelist) ordered_counts.append(curr_genelist_count) diff --git a/ppanggolin/figures/tile_plot.py b/ppanggolin/figures/tile_plot.py index 2c6fa006..762d978d 100644 --- a/ppanggolin/figures/tile_plot.py +++ b/ppanggolin/figures/tile_plot.py @@ -34,7 +34,7 @@ def draw_tile_plot(pangenome: Pangenome, output: Path, nocloud: bool = False, di check_pangenome_info(pangenome, need_annotations=True, need_families=True, need_graph=True, disable_bar=disable_bar) if pangenome.status["partitioned"] == "No": raise Exception("Cannot draw the tile plot as your pangenome has not been partitioned") - if len(pangenome.organisms) > 500 and nocloud is False: + if pangenome.number_of_organisms > 500 and nocloud is False: logging.getLogger("PPanGGOLiN").warning("You asked to draw a tile plot for a lot of organisms (>500). 
" "Your browser will probably not be able to open it.") logging.getLogger("PPanGGOLiN").info("Drawing the tile plot...") @@ -65,14 +65,13 @@ def draw_tile_plot(pangenome: Pangenome, output: Path, nocloud: bool = False, di index2fam[row] = fam.name fam2index[fam.name] = row - mat_p_a = csc_matrix((data, (all_indexes, all_columns)), shape=(len(families), len(pangenome.organisms)), + mat_p_a = csc_matrix((data, (all_indexes, all_columns)), shape=(len(families), pangenome.number_of_organisms), dtype='float') dist = pdist(1 - jaccard_similarities(mat_p_a, 0).todense()) hc = linkage(dist, 'single') - logging.getLogger("PPanGGOLiN").info("done with making the dendrogram to order the organisms on the plot") dendro_org = dendrogram(hc, no_plot=True) - logging.getLogger().info("done with making the dendrogram to order the organisms on the plot") + logging.getLogger("PPanGGOLiN").info("done with making the dendrogram to order the organisms on the plot") order_organisms = [index2org[index] for index in dendro_org["leaves"]] @@ -85,13 +84,13 @@ def draw_tile_plot(pangenome: Pangenome, output: Path, nocloud: bool = False, di partitions_dict[fam.partition].append(fam) if fam.partition.startswith("S"): shell_subs.add(fam.partition) # number of elements will tell the number of subpartitions - ordered_nodes_p = sorted(partitions_dict["P"], key=lambda n: len(n.organisms), reverse=True) - ordered_nodes_c = sorted(partitions_dict["C"], key=lambda n: len(n.organisms), reverse=True) + ordered_nodes_p = sorted(partitions_dict["P"], key=lambda n: n.number_of_organisms, reverse=True) + ordered_nodes_c = sorted(partitions_dict["C"], key=lambda n: n.number_of_organisms, reverse=True) sep_p = len(ordered_nodes_p) - 0.5 separators = [sep_p] shell_na = None if len(shell_subs) == 1: - ordered_nodes_s = sorted(partitions_dict[shell_subs.pop()], key=lambda n: len(n.organisms), reverse=True) + ordered_nodes_s = sorted(partitions_dict[shell_subs.pop()], key=lambda n: n.number_of_organisms, 
reverse=True) ordered_nodes = ordered_nodes_p + ordered_nodes_s + ordered_nodes_c separators.append(separators[len(separators) - 1] + len(ordered_nodes_s)) separators.append(separators[len(separators) - 1] + len(ordered_nodes_c)) @@ -100,7 +99,7 @@ def draw_tile_plot(pangenome: Pangenome, output: Path, nocloud: bool = False, di for subpartition in sorted(shell_subs): if subpartition == "S_": shell_na = len(separators) - 1 - ordered_nodes_s = sorted(partitions_dict[subpartition], key=lambda n: len(n.organisms), reverse=True) + ordered_nodes_s = sorted(partitions_dict[subpartition], key=lambda n: n.number_of_organisms, reverse=True) ordered_nodes += ordered_nodes_s separators.append(separators[len(separators) - 1] + len(ordered_nodes_s)) ordered_nodes += ordered_nodes_c @@ -110,7 +109,7 @@ def draw_tile_plot(pangenome: Pangenome, output: Path, nocloud: bool = False, di for node in ordered_nodes: fam_order.append('\u200c' + node.name) data = node.organisms - binary_data.append([len(node.get_genes_per_org(org)) if org in data else numpy.nan for org in order_organisms]) + binary_data.append([len(list(node.get_genes_per_org(org))) if org in data else numpy.nan for org in order_organisms]) text_data.append([("\n".join(map(str, node.get_genes_per_org(org)))) if org in data else numpy.nan for org in order_organisms]) @@ -154,9 +153,9 @@ def draw_tile_plot(pangenome: Pangenome, output: Path, nocloud: bool = False, di else: color = colors["shell"] shapes.append(dict(type='line', x0=-1, x1=-1, y0=sep_prec, y1=sep, line=dict(dict(width=10, color=color)))) - shapes.append(dict(type='line', x0=len(pangenome.organisms), x1=len(pangenome.organisms), y0=sep_prec, y1=sep, + shapes.append(dict(type='line', x0=pangenome.number_of_organisms, x1=pangenome.number_of_organisms, y0=sep_prec, y1=sep, line=dict(dict(width=10, color=color)))) - shapes.append(dict(type='line', x0=-1, x1=len(pangenome.organisms), y0=sep, y1=sep, + shapes.append(dict(type='line', x0=-1, 
x1=pangenome.number_of_organisms, y0=sep, y1=sep, line=dict(dict(width=1, color=color)))) sep_prec = sep @@ -188,4 +187,4 @@ def draw_tile_plot(pangenome: Pangenome, output: Path, nocloud: bool = False, di fig.update_layout(layout) out_plotly.plot(fig, filename=output.as_posix() + "/tile_plot.html", auto_open=False) - logging.getLogger("PPanGGOLiN").info(f"Done with the tile plot : '{output.as_posix() + '/tile_plot.html'}' ") + logging.getLogger("PPanGGOLiN").info(f"Done with the tile plot : '{output / 'tile_plot.html'}' ") diff --git a/ppanggolin/figures/ucurve.py b/ppanggolin/figures/ucurve.py index 4d31f2d4..1abd3749 100644 --- a/ppanggolin/figures/ucurve.py +++ b/ppanggolin/figures/ucurve.py @@ -28,7 +28,7 @@ def draw_ucurve(pangenome: Pangenome, output: Path, soft_core: float = 0.95, di is_partitioned = False has_undefined = False for fam in pangenome.gene_families: - nb_org = len(fam.organisms) + nb_org = fam.number_of_organisms if fam.partition != "": is_partitioned = True if fam.partition == "U": @@ -39,7 +39,7 @@ def draw_ucurve(pangenome: Pangenome, output: Path, soft_core: float = 0.95, di data_plot = [] chao = "NA" if count[1]["pangenome"] > 0: - chao = round(len(pangenome.gene_families) + ((count[0]["pangenome"] ^ 2) / (count[1]["pangenome"] * 2)), 2) + chao = round(pangenome.number_of_gene_families + ((count[0]["pangenome"] ^ 2) / (count[1]["pangenome"] * 2)), 2) colors = {"pangenome": "black", "exact_accessory": "#EB37ED", "exact_core": "#FF2828", "soft_core": "#c7c938", "soft_accessory": "#996633", "shell": "#00D860", "persistent": "#F7A507", "cloud": "#79DEFF", "undefined": "#828282"} @@ -48,24 +48,24 @@ def draw_ucurve(pangenome: Pangenome, output: Path, soft_core: float = 0.95, di persistent_values = [] shell_values = [] cloud_values = [] - for nb_org in range(1, len(pangenome.organisms) + 1): + for nb_org in range(1, pangenome.number_of_organisms + 1): persistent_values.append(count[nb_org]["persistent"]) 
shell_values.append(count[nb_org]["shell"]) cloud_values.append(count[nb_org]["cloud"]) - data_plot.append(go.Bar(x=list(range(1, len(pangenome.organisms) + 1)), y=persistent_values, name='persistent', + data_plot.append(go.Bar(x=list(range(1, pangenome.number_of_organisms + 1)), y=persistent_values, name='persistent', marker=dict(color=colors["persistent"]))) - data_plot.append(go.Bar(x=list(range(1, len(pangenome.organisms) + 1)), y=shell_values, name='shell', + data_plot.append(go.Bar(x=list(range(1, pangenome.number_of_organisms + 1)), y=shell_values, name='shell', marker=dict(color=colors["shell"]))) - data_plot.append(go.Bar(x=list(range(1, len(pangenome.organisms) + 1)), y=cloud_values, name='cloud', + data_plot.append(go.Bar(x=list(range(1, pangenome.number_of_organisms + 1)), y=cloud_values, name='cloud', marker=dict(color=colors["cloud"]))) else: text = 'undefined' if has_undefined else "pangenome" undefined_values = [] - for nb_org in range(1, len(pangenome.organisms) + 1): + for nb_org in range(1, pangenome.number_of_organisms + 1): undefined_values.append(count[nb_org][text]) - data_plot.append(go.Bar(x=list(range(1, len(pangenome.organisms) + 1)), y=undefined_values, name=text, + data_plot.append(go.Bar(x=list(range(1, pangenome.number_of_organisms + 1)), y=undefined_values, name=text, marker=dict(color=colors[text]))) - x = len(pangenome.organisms) * soft_core + x = pangenome.number_of_organisms * soft_core layout = go.Layout(title="Gene families frequency distribution (U shape), chao=" + str(chao), xaxis=dict(title='Occurring in x genomes'), yaxis=dict(title='# of gene families (F)'), diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py index e7f831f0..6ab64796 100644 --- a/ppanggolin/formats/readBinaries.py +++ b/ppanggolin/formats/readBinaries.py @@ -13,9 +13,10 @@ import tables # local libraries -from ppanggolin.genome import Organism, Gene, RNA +from ppanggolin.genome import Organism, Gene, RNA, Contig from 
ppanggolin.pangenome import Pangenome -from ppanggolin.region import Spot, Module +from ppanggolin.geneFamily import GeneFamily +from ppanggolin.region import Region, Spot, Module from ppanggolin.metadata import Metadata @@ -83,6 +84,7 @@ def get_number_of_organisms(pangenome: Pangenome) -> int: return len(org_set) +# TODO Remove this function def fix_partitioned(pangenome_file: str): """ Fixes pangenomes with the 'partitionned' typo. @@ -101,7 +103,6 @@ def fix_partitioned(pangenome_file: str): del status_group._v_attrs.Partitionned h5f.close() - def get_status(pangenome: Pangenome, pangenome_file: Path): """ Checks which elements are already present in the file. @@ -171,14 +172,14 @@ def read_genedata(h5f: tables.File) -> dict: table = h5f.root.annotations.genedata genedata_id2genedata = {} for row in read_chunks(table, chunk=20000): - genedata = Genedata(start=row["start"], - stop=row["stop"], + genedata = Genedata(start=int(row["start"]), + stop=int(row["stop"]), strand=row["strand"].decode(), gene_type=row["gene_type"].decode(), - position=row["position"], + position=int(row["position"]), name=row["name"].decode(), product=row["product"].decode(), - genetic_code=row["genetic_code"]) + genetic_code=int(row["genetic_code"])) genedata_id = row["genedata_id"] genedata_id2genedata[genedata_id] = genedata return genedata_id2genedata @@ -243,7 +244,7 @@ def get_gene_sequences_from_file(pangenome_filename: str, file_obj: TextIO, list :param add: Add a prefix to sequence header :param disable_bar: Prevent to print disable progress bar """ - logging.getLogger("PPanGGOLiN").info(f"Extracting and writing CDS sequences from a {filename} file to a fasta file...") + logging.getLogger("PPanGGOLiN").info(f"Extracting and writing CDS sequences from a {pangenome_filename} file to a fasta file...") h5f = tables.open_file(pangenome_filename, "r", driver_core_backing_store=0) table = h5f.root.geneSequences list_cds = set(list_cds) if list_cds is not None else None @@ -273,7 +274,11 
@@ def read_organism(pangenome: Pangenome, org_name: str, contig_dict: dict, circul org = Organism(org_name) gene, gene_type = (None, None) for contig_name, gene_list in contig_dict.items(): - contig = org.get_contig(contig_name, is_circular=circular_contigs[contig_name]) + try: + contig = org.get(contig_name) + except KeyError: + contig = Contig(contig_name, is_circular=circular_contigs[contig_name]) + org.add(contig) for row in gene_list: if link: # if the gene families are already computed/loaded the gene exists. gene = pangenome.get_gene(row["ID"].decode()) @@ -300,7 +305,7 @@ def read_organism(pangenome: Pangenome, org_name: str, contig_dict: dict, circul gene.is_fragment = row["is_fragment"] gene.fill_parents(org, contig) if gene_type == "CDS": - contig.add_gene(gene) + contig[gene.start] = gene elif "RNA" in gene_type: contig.add_rna(gene) else: @@ -342,12 +347,16 @@ def read_gene_families(pangenome: Pangenome, h5f: tables.File, disable_bar: bool link = True if pangenome.status["genomesAnnotated"] in ["Computed", "Loaded"] else False for row in tqdm(read_chunks(table, chunk=20000), total=table.nrows, unit="gene family", disable=disable_bar): - fam = pangenome.add_gene_family(row["geneFam"].decode()) + try: + fam = pangenome.get_gene_family(name=row["geneFam"].decode()) + except KeyError: + fam = GeneFamily(family_id=pangenome.max_fam_id, name=row["geneFam"].decode()) + pangenome.add_gene_family(fam) if link: # linking if we have loaded the annotations gene_obj = pangenome.get_gene(row["gene"].decode()) else: # else, no gene_obj = Gene(row["gene"].decode()) - fam.add_gene(gene_obj) + fam.add(gene_obj) pangenome.status["genesClustered"] = "Loaded" @@ -362,8 +371,8 @@ def read_gene_families_info(pangenome: Pangenome, h5f: tables.File, disable_bar: table = h5f.root.geneFamiliesInfo for row in tqdm(read_chunks(table, chunk=20000), total=table.nrows, unit="gene family", disable=disable_bar): - fam = pangenome.add_gene_family(row["name"].decode()) - 
fam.add_partition(row["partition"].decode()) + fam = pangenome.get_gene_family(row["name"].decode()) + fam.partition = row["partition"].decode() fam.add_sequence(row["protein"].decode()) if h5f.root.status._v_attrs.Partitioned: @@ -387,7 +396,7 @@ def read_gene_sequences(pangenome: Pangenome, h5f: tables.File, disable_bar: boo seqid2seq = read_sequences(h5f) for row in tqdm(read_chunks(table, chunk=20000), total=table.nrows, unit="gene", disable=disable_bar): gene = pangenome.get_gene(row['gene'].decode()) - gene.add_dna(seqid2seq[row['seqid']]) + gene.add_sequence(seqid2seq[row['seqid']]) pangenome.status["geneSequences"] = "Loaded" @@ -406,11 +415,13 @@ def read_rgp(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False): table = h5f.root.RGP for row in tqdm(read_chunks(table, chunk=20000), total=table.nrows, unit="region", disable=disable_bar): - region = pangenome.get_region(row["RGP"].decode()) - region.append(pangenome.get_gene(row["gene"].decode())) - # order the genes properly in the regions - for region in pangenome.regions: - region.genes = sorted(region.genes, key=lambda x: x.position) # order the same way as on the contig + try: + region = pangenome.get_region(row["RGP"].decode()) + except KeyError: + region = Region(row["RGP"].decode()) + pangenome.add_region(region) + gene = pangenome.get_gene(row["gene"].decode()) + region.add(gene) pangenome.status["predictedRGP"] = "Loaded" @@ -425,13 +436,15 @@ def read_spots(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False table = h5f.root.spots spots = {} for row in tqdm(read_chunks(table, chunk=20000), total=table.nrows, unit="spot", disable=disable_bar): - curr_spot = spots.get(row["spot"]) + curr_spot = spots.get(int(row["spot"])) if curr_spot is None: - curr_spot = Spot(row["spot"]) + curr_spot = Spot(int(row["spot"])) spots[row["spot"]] = curr_spot - curr_spot.add_region(pangenome.get_region(row["RGP"].decode())) + region = pangenome.get_region(row["RGP"].decode()) + 
curr_spot.add(region) curr_spot.spot_2_families() - pangenome.add_spots(spots.values()) + for spot in spots.values(): + pangenome.add_spot(spot) pangenome.status["spots"] = "Loaded" @@ -448,12 +461,14 @@ def read_modules(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = Fal table = h5f.root.modules modules = {} # id2mod for row in tqdm(read_chunks(table, chunk=20000), total=table.nrows, unit="module", disable=disable_bar): - curr_module = modules.get(row['module']) + curr_module = modules.get(int(row['module'])) if curr_module is None: - curr_module = Module(row['module']) + curr_module = Module(int(row['module'])) modules[row["module"]] = curr_module - curr_module.add_family(pangenome.get_gene_family(row['geneFam'].decode())) - pangenome.add_modules(modules.values()) + family = pangenome.get_gene_family(row['geneFam'].decode()) + curr_module.add(family) + for module in modules.values(): + pangenome.add_module(module) pangenome.status["modules"] = "Loaded" diff --git a/ppanggolin/formats/writeBinaries.py b/ppanggolin/formats/writeBinaries.py index 6b89b204..ace14cb2 100644 --- a/ppanggolin/formats/writeBinaries.py +++ b/ppanggolin/formats/writeBinaries.py @@ -15,9 +15,8 @@ # local libraries from ppanggolin.pangenome import Pangenome -from ppanggolin.genome import Feature, Gene from ppanggolin.formats.writeMetadata import write_metadata, erase_metadata, write_metadata_status -from ppanggolin.genome import Feature +from ppanggolin.genome import Feature, Gene from ppanggolin.formats.readBinaries import read_genedata, Genedata @@ -158,17 +157,17 @@ def write_annotations(pangenome: Pangenome, h5f: tables.File, disable_bar: bool """ annotation = h5f.create_group("/", "annotations", "Annotations of the pangenome organisms") gene_table = h5f.create_table(annotation, "genes", gene_desc(*get_max_len_annotations(pangenome)), - expectedrows=len(pangenome.genes)) + expectedrows=pangenome.number_of_genes) - logging.getLogger("PPanGGOLiN").debug(f"Writing 
{len(pangenome.genes)} genes") + logging.getLogger("PPanGGOLiN").debug(f"Writing {pangenome.number_of_genes} genes") genedata2gene = {} genedata_counter = 0 gene_row = gene_table.row - for org in tqdm(pangenome.organisms, total=pangenome.number_of_organisms(), unit="genome", disable=disable_bar): + for org in tqdm(pangenome.organisms, total=pangenome.number_of_organisms, unit="genome", disable=disable_bar): for contig in org.contigs: - for gene in contig.genes + list(contig.RNAs): + for gene in list(contig.genes) + list(contig.RNAs): gene_row["organism"] = org.name gene_row["contig/name"] = contig.name gene_row["contig/is_circular"] = contig.is_circular @@ -268,12 +267,12 @@ def write_gene_sequences(pangenome: Pangenome, h5f: tables.File, disable_bar: bo :param disable_bar: Disable progress bar """ gene_seq = h5f.create_table("/", "geneSequences", gene_sequences_desc(*get_gene_sequences_len(pangenome)), - expectedrows=len(pangenome.genes)) + expectedrows=pangenome.number_of_genes) # process sequences to save them only once seq2seqid = {} id_counter = 0 gene_row = gene_seq.row - for gene in tqdm(pangenome.genes, total=pangenome.number_of_gene(), unit="gene", disable=disable_bar): + for gene in tqdm(sorted(pangenome.genes, key=lambda x: x.ID), total=pangenome.number_of_genes, unit="gene", disable=disable_bar): curr_seq_id = seq2seqid.get(gene.dna) if curr_seq_id is None: curr_seq_id = id_counter @@ -347,10 +346,10 @@ def write_gene_fam_info(pangenome: Pangenome, h5f: tables.File, force: bool = Fa logging.getLogger("PPanGGOLiN").info("Erasing the formerly computed gene family representative sequences...") h5f.remove_node('/', 'geneFamiliesInfo') # erasing the table, and rewriting a new one. 
gene_fam_seq = h5f.create_table("/", "geneFamiliesInfo", gene_fam_desc(*get_gene_fam_len(pangenome)), - expectedrows=len(pangenome.gene_families)) + expectedrows=pangenome.number_of_gene_families) row = gene_fam_seq.row - for fam in tqdm(pangenome.gene_families, total=pangenome.number_of_gene_families(), + for fam in tqdm(pangenome.gene_families, total=pangenome.number_of_gene_families, unit="gene family", disable=disable_bar): row["name"] = fam.name row["protein"] = fam.sequence @@ -407,7 +406,7 @@ def write_gene_families(pangenome: Pangenome, h5f: tables.File, force: bool = Fa h5f.remove_node('/', 'geneFamilies') # erasing the table, and rewriting a new one. gene_families = h5f.create_table("/", "geneFamilies", gene_to_fam_desc(*get_gene_to_fam_len(pangenome))) gene_row = gene_families.row - for family in tqdm(pangenome.gene_families, total=pangenome.number_of_gene_families(), unit="gene family", + for family in tqdm(pangenome.gene_families, total=pangenome.number_of_gene_families, unit="gene family", disable=disable_bar): for gene in family.genes: gene_row["gene"] = gene.ID @@ -461,14 +460,13 @@ def write_graph(pangenome: Pangenome, h5f: tables.File, force: bool = False, dis logging.getLogger("PPanGGOLiN").info("Erasing the formerly computed edges") h5f.remove_node("/", "edges") edge_table = h5f.create_table("/", "edges", graph_desc(get_gene_id_len(pangenome)), - expectedrows=len(pangenome.edges)) + expectedrows=pangenome.number_of_edges) edge_row = edge_table.row - for edge in tqdm(pangenome.edges, total=pangenome.number_of_edge(), unit="edge", disable=disable_bar): - for gene_pairs in edge.organisms.values(): - for gene1, gene2 in gene_pairs: - edge_row["geneTarget"] = gene1.ID - edge_row["geneSource"] = gene2.ID - edge_row.append() + for edge in tqdm(pangenome.edges, total=pangenome.number_of_edges, unit="edge", disable=disable_bar): + for gene1, gene2 in edge.gene_pairs: + edge_row["geneTarget"] = gene1.ID + edge_row["geneSource"] = gene2.ID + 
edge_row.append() edge_table.flush() @@ -520,9 +518,9 @@ def write_rgp(pangenome: Pangenome, h5f: tables.File, force: bool = False, disab h5f.remove_node('/', 'RGP') rgp_table = h5f.create_table('/', 'RGP', rgp_desc(*get_rgp_len(pangenome)), - expectedrows=sum([len(region.genes) for region in pangenome.regions])) + expectedrows=sum([len(region) for region in pangenome.regions])) rgp_row = rgp_table.row - for region in tqdm(pangenome.regions, total=pangenome.number_of_rgp(), unit="region", disable=disable_bar): + for region in tqdm(pangenome.regions, total=pangenome.number_of_rgp, unit="region", disable=disable_bar): for gene in region.genes: rgp_row["RGP"] = region.name rgp_row["gene"] = gene.ID @@ -574,9 +572,9 @@ def write_spots(pangenome: Pangenome, h5f: tables.File, force: bool = False, dis h5f.remove_node("/", "spots") spot_table = h5f.create_table("/", "spots", spot_desc(get_spot_desc(pangenome)), - expectedrows=sum([len(spot.regions) for spot in pangenome.spots])) + expectedrows=sum([len(spot) for spot in pangenome.spots])) spot_row = spot_table.row - for spot in tqdm(pangenome.spots, total=pangenome.number_of_spots(), unit="spot", disable=disable_bar): + for spot in tqdm(pangenome.spots, total=pangenome.number_of_spots, unit="spot", disable=disable_bar): for region in spot.regions: spot_row["spot"] = spot.ID spot_row["RGP"] = region.name @@ -628,10 +626,10 @@ def write_modules(pangenome: Pangenome, h5f: tables.File, force: bool = False, d h5f.remove_node("/", "modules") mod_table = h5f.create_table('/', 'modules', mod_desc(get_mod_desc(pangenome)), - expectedrows=sum([len(mod.families) for mod in pangenome.modules])) + expectedrows=sum([len(mod) for mod in pangenome.modules])) mod_row = mod_table.row - for mod in tqdm(pangenome.modules, total=pangenome.number_of_modules(), unit="modules", disable=disable_bar): + for mod in tqdm(pangenome.modules, total=pangenome.number_of_modules, unit="modules", disable=disable_bar): for fam in mod.families: 
mod_row["geneFam"] = fam.name mod_row["module"] = mod.ID @@ -722,12 +720,12 @@ def getmin(arg: iter) -> float: else: info_group = h5f.create_group("/", "info", "Informations about the pangenome content") if pangenome.status["genomesAnnotated"] in ["Computed", "Loaded"]: - info_group._v_attrs.numberOfGenes = len(pangenome.genes) - info_group._v_attrs.numberOfOrganisms = len(pangenome.organisms) + info_group._v_attrs.numberOfGenes = pangenome.number_of_genes + info_group._v_attrs.numberOfOrganisms = pangenome.number_of_organisms if pangenome.status["genesClustered"] in ["Computed", "Loaded"]: - info_group._v_attrs.numberOfClusters = len(pangenome.gene_families) + info_group._v_attrs.numberOfClusters = pangenome.number_of_gene_families if pangenome.status["neighborsGraph"] in ["Computed", "Loaded"]: - info_group._v_attrs.numberOfEdges = len(pangenome.edges) + info_group._v_attrs.numberOfEdges = pangenome.number_of_edges if pangenome.status["partitioned"] in ["Computed", "Loaded"]: named_part_counter = Counter() subpart_counter = Counter() @@ -735,7 +733,7 @@ def getmin(arg: iter) -> float: part_set = set() for fam in pangenome.gene_families: named_part_counter[fam.named_partition] += 1 - part_distribs[fam.named_partition].append(len(fam.organisms) / len(pangenome.organisms)) + part_distribs[fam.named_partition].append(fam.number_of_organisms / pangenome.number_of_organisms) if fam.named_partition == "shell": subpart_counter[fam.partition] += 1 if fam.partition != "S_": @@ -757,12 +755,12 @@ def getmin(arg: iter) -> float: info_group._v_attrs.numberOfPartitions = len(part_set) info_group._v_attrs.numberOfSubpartitions = subpart_counter if pangenome.status["predictedRGP"] in ["Computed", "Loaded"]: - info_group._v_attrs.numberOfRGP = len(pangenome.regions) + info_group._v_attrs.numberOfRGP = pangenome.number_of_rgp if pangenome.status["spots"] in ["Computed", "Loaded"]: - info_group._v_attrs.numberOfSpots = len(pangenome.spots) + info_group._v_attrs.numberOfSpots = 
pangenome.number_of_spots if pangenome.status["modules"] in ["Computed", "Loaded"]: - info_group._v_attrs.numberOfModules = len(pangenome.modules) - info_group._v_attrs.numberOfFamiliesInModules = sum([len(mod.families) for mod in pangenome.modules]) + info_group._v_attrs.numberOfModules = pangenome.number_of_modules + info_group._v_attrs.numberOfFamiliesInModules = sum([len(mod) for mod in pangenome.modules]) info_group._v_attrs.parameters = pangenome.parameters # saving the pangenome parameters @@ -827,7 +825,7 @@ def part_spec(part: str) -> list: pangenome.compute_mod_bitarrays(part) return [popcount(module.bitarray) for module in pangenome.modules] - mod_fam = [len(module.families) for module in pangenome.modules] + mod_fam = [len(module) for module in pangenome.modules] info_group._v_attrs.StatOfFamiliesInModules = {"min": getmin(mod_fam), "max": getmax(mod_fam), "sd": getstdev(mod_fam), @@ -1000,11 +998,12 @@ def write_pangenome(pangenome: Pangenome, filename, force: bool = False, disable except AssertionError: raise AssertionError("Something REALLY unexpected and unplanned for happened here. " "Please post an issue on github with what you did to reach this error.") - else: + + if pangenome.status["genomesAnnotated"] in ["Computed", "Loaded", "inFile"]: if pangenome.status["genomesAnnotated"] == "Computed": compression_filter = tables.Filters(complevel=1, shuffle=True, bitshuffle=True, complib='blosc:zstd') h5f = tables.open_file(filename, "w", filters=compression_filter) - logging.getLogger().info("Writing genome annotations...") + logging.getLogger("PPanGGOLiN").info("Writing genome annotations...") write_annotations(pangenome, h5f, disable_bar=disable_bar) @@ -1061,3 +1060,5 @@ def write_pangenome(pangenome: Pangenome, filename, force: bool = False, disable h5f.close() logging.getLogger("PPanGGOLiN").info(f"Done writing the pangenome. 
It is in file : {filename}") + + \ No newline at end of file diff --git a/ppanggolin/formats/writeFlat.py b/ppanggolin/formats/writeFlat.py index 1af8ffa5..21c7670b 100644 --- a/ppanggolin/formats/writeFlat.py +++ b/ppanggolin/formats/writeFlat.py @@ -4,7 +4,6 @@ # default libraries import argparse import logging -import pdb from multiprocessing import get_context from collections import Counter, defaultdict from pathlib import Path @@ -63,7 +62,7 @@ def write_json_gene_fam(gene_fam: GeneFamily, json: TextIO): :param gene_fam: file-like object, compressed or not :param json: file-like object, compressed or not """ - json.write('{' + f'"id": "{gene_fam.name}", "nb_genes": {len(gene_fam.genes)}, ' + json.write('{' + f'"id": "{gene_fam.name}", "nb_genes": {len(gene_fam)}, ' f'"partition": "{gene_fam.named_partition}", "subpartition": "{gene_fam.partition}"' + '}') org_dict = {} name_counts = Counter() @@ -128,12 +127,12 @@ def write_json_edge(edge: Edge, json: TextIO): json.write(f'"weight": {len(edge.gene_pairs)}, "source": "{edge.source.name}", "target": "{edge.target.name}"') json.write(', "organisms": {') orgstr = [] - for org in edge.get_org_dict(): + for org in edge.organisms: orgstr.append('"' + org.name + '": [') genepairstr = [] - for genepair in edge.get_org_dict()[org]: - genepairstr.append('{"source": "' + genepair[0].ID + '", "target": "' + genepair[ - 1].ID + f'", "length": {genepair[0].start - genepair[1].stop}' + '}') + for gene_pair in edge.get_organism_genes_pairs(org): + genepairstr.append('{"source": "' + gene_pair[0].ID + '", "target": "' + gene_pair[ + 1].ID + f'", "length": {gene_pair[0].start - gene_pair[1].stop}' + '}') orgstr[-1] += ', '.join(genepairstr) + ']' json.write(', '.join(orgstr) + "}}") @@ -144,9 +143,9 @@ def write_json_edges(json): :param json: file-like object, compressed or not """ json.write(', "links": [') - edgelist = pan.edges - write_json_edge(edgelist[0], json) - for edge in edgelist[1:]: + edge_list = list(pan.edges) + 
write_json_edge(edge_list[0], json) + for edge in edge_list[1:]: json.write(", ") write_json_edge(edge, json) json.write(']') @@ -193,32 +192,32 @@ def write_gexf_header(gexf: TextIO, light: bool = True): gexf.write(' \n') gexf.write(' \n') - if len(pan.spots): + if pan.number_of_spots > 0: gexf.write(' \n') - if len(pan.modules): + if pan.number_of_modules > 0: gexf.write(' \n') shift = 14 - source_fields = {m.source : m.fields for f in pan.gene_families if len(list(f.metadata)) > 0 for m in f.metadata} + source_fields = {m.source: m.fields for f in pan.gene_families if len(list(f.metadata)) > 0 for m in f.metadata} for source_metadata_families in pan.metadata_sources("families"): for field in source_fields[source_metadata_families]: gexf.write(f' \n') shift += 1 if not light: - for org, orgIndex in index.items(): - gexf.write(f' \n') - + for org, org_idx in index.items(): + gexf.write(f' \n') gexf.write(' \n') gexf.write(' \n') gexf.write(' \n') if not light: - for org, orgIndex in index.items(): - gexf.write(f' \n') + for org, org_idx in index.items(): + gexf.write(f' \n') gexf.write(' \n') gexf.write(' \n') gexf.write(f' PPanGGOLiN {pkg_resources.get_distribution("ppanggolin").version}\n') gexf.write(' \n') + def write_gexf_nodes(gexf: TextIO, light: bool = True, soft_core: False = 0.95): """Write the node of pangenome graph in gexf file @@ -246,26 +245,26 @@ def write_gexf_nodes(gexf: TextIO, light: bool = True, soft_core: False = 0.95): gexf.write(f' \n') gexf.write(f' \n') - gexf.write(f' \n') + gexf.write(f' \n') gexf.write(' \n') - gexf.write(f' \n') + gexf.write(f' \n') gexf.write(f' \n') gexf.write(f' \n') gexf.write(f' \n') gexf.write(f' \n') gexf.write(f' \n') gexf.write(f' \n') + f'{"exact_accessory" if fam.number_of_organisms != pan.number_of_organisms else "exact_core"}" />\n') gexf.write(f' = (len(pan.organisms) * soft_core) else "soft_accessory"}"' + f'{"soft_core" if fam.number_of_organisms >= (pan.number_of_organisms * soft_core) else 
"soft_accessory"}"' f' />\n') gexf.write(f' \n') gexf.write(f' \n') - gexf.write(f' \n') - if len(pan.spots) > 0: - str_spot = "|".join([str(s) for s in list(fam.spot)]) + gexf.write(f' \n') + if pan.number_of_spots > 0: + str_spot = "|".join([str(s) for s in list(fam.spots)]) gexf.write(f' \n') - if len(pan.modules) > 0: + if pan.number_of_modules > 0: str_module = "|".join([str(m) for m in list(fam.modules)]) gexf.write(f' \n') shift = 14 @@ -275,7 +274,7 @@ def write_gexf_nodes(gexf: TextIO, light: bool = True, soft_core: False = 0.95): for m in fam.metadata: if m.source == source_metadata_families: for field in m.fields: - to_concat[field].append(str(m.get(field))) + to_concat[field].append(str(getattr(m, field))) for field in source_fields[source_metadata_families]: concatenated_fields = '|'.join(to_concat[field]) gexf.write(f' \n') @@ -303,13 +302,14 @@ def write_gexf_edges(gexf: TextIO, light: bool = True): for edge in pan.edges: gexf.write(f' \n') - gexf.write(f' \n') + f'{edge.source.ID}" target="{edge.target.ID}" weight="{edge.number_of_organisms}">\n') + gexf.write(f' \n') gexf.write(' \n') gexf.write(f' \n') if not light: - for org, genes in edge.get_org_dict().items(): - gexf.write(f' \n') + for org, genes_pairs in edge.get_organisms_dict().items(): + gexf.write( + f' \n') gexf.write(' \n') gexf.write(' \n') edgeids += 1 @@ -386,7 +386,7 @@ def write_matrix(output: Path, sep: str = ',', ext: str = 'csv', compress: bool '"Max group size nuc"', # 13 '"Avg group size nuc"'] # 14 + ['"' + str(org) + '"' for org in pan.organisms]) + "\n") # 15 - default_genes = ['""'] * len(pan.organisms) if gene_names else ["0"] * len(pan.organisms) + default_genes = ['""'] * pan.number_of_organisms if gene_names else ["0"] * pan.number_of_organisms org_index = pan.get_org_index() # should just return things for fam in pan.gene_families: genes = default_genes.copy() @@ -410,9 +410,9 @@ def write_matrix(output: Path, sep: str = ',', ext: str = 'csv', compress: bool 
matrix.write(sep.join(['"' + fam.name + '"', # 1 '"' + alt + '"', # 2 '"' + str(product.most_common(1)[0][0]) + '"', # 3 - '"' + str(len(fam.organisms)) + '"', # 4 - '"' + str(len(fam.genes)) + '"', # 5 - '"' + str(round(len(fam.genes) / len(fam.organisms), 2)) + '"', # 6 + '"' + str(fam.number_of_organisms) + '"', # 4 + '"' + str(len(fam)) + '"', # 5 + '"' + str(round(len(fam) / fam.number_of_organisms, 2)) + '"', # 6 '"NA"', # 7 '"NA"', # 8 '""', # 9 @@ -443,7 +443,7 @@ def write_gene_presence_absence(output: Path, compress: bool = False): matrix.write('\t'.join(['Gene'] + # 14 [str(org) for org in pan.organisms]) + "\n") # 15 - default_genes = ["0"] * len(pan.organisms) + default_genes = ["0"] * pan.number_of_organisms org_index = pan.get_org_index() # should just return things for fam in pan.gene_families: genes = default_genes.copy() @@ -473,12 +473,12 @@ def write_stats(output: Path, soft_core: float = 0.95, dup_margin: float = 0.05, "\n") for fam in pan.gene_families: if fam.named_partition == "persistent": - mean_pres = len(fam.genes) / len(fam.organisms) + mean_pres = len(fam) / fam.number_of_organisms nb_multi = 0 for gene_list in fam.get_org_dict().values(): if len(gene_list) > 1: nb_multi += 1 - dup_ratio = nb_multi / len(fam.organisms) + dup_ratio = nb_multi / fam.number_of_organisms is_scm = False if dup_ratio < dup_margin: is_scm = True @@ -492,9 +492,9 @@ def write_stats(output: Path, soft_core: float = 0.95, dup_margin: float = 0.05, soft = set() # could use bitarrays if speed is needed core = set() for fam in pan.gene_families: - if len(fam.organisms) >= pan.number_of_organisms() * soft_core: + if fam.number_of_organisms >= pan.number_of_organisms * soft_core: soft.add(fam) - if len(fam.organisms) == pan.number_of_organisms(): + if fam.number_of_organisms == pan.number_of_organisms: core.add(fam) with write_compressed_or_not(output / "organisms_statistics.tsv", compress) as outfile: @@ -506,11 +506,10 @@ def write_stats(output: Path, soft_core: 
float = 0.95, dup_margin: float = 0.05, "nb_exact_core_genes", "nb_soft_core_genes", "completeness", "nb_single_copy_markers"]) + "\n") for org in pan.organisms: - fams = org.families nb_pers = 0 nb_shell = 0 nb_cloud = 0 - for fam in fams: + for fam in org.families: if fam.named_partition == "persistent": nb_pers += 1 elif fam.named_partition == "shell": @@ -536,14 +535,15 @@ def write_stats(output: Path, soft_core: float = 0.95, dup_margin: float = 0.05, nb_gene_core += 1 completeness = "NA" if len(single_copy_markers) > 0: - completeness = round((len(fams & single_copy_markers) / len(single_copy_markers)) * 100, 2) + completeness = round(((org.number_of_families() + len(single_copy_markers)) / + len(single_copy_markers)) * 100, 2) outfile.write("\t".join(map(str, [org.name, - len(fams), + org.number_of_families(), nb_pers, nb_shell, nb_cloud, - len(core & fams), - len(soft & fams), + len(core) + org.number_of_families(), + len(soft) + org.number_of_families(), org.number_of_genes(), nb_gene_pers, nb_gene_shell, @@ -551,7 +551,7 @@ def write_stats(output: Path, soft_core: float = 0.95, dup_margin: float = 0.05, nb_gene_core, nb_gene_soft, completeness, - len(fams & single_copy_markers)])) + "\n") + org.number_of_families() + len(single_copy_markers)])) + "\n") logging.getLogger("PPanGGOLiN").info("Done writing genome per genome statistics") @@ -580,7 +580,6 @@ def write_org_file(org: Organism, output: Path, compress: bool = False): nb_shell = 0 nb_cloud = 0 modules = None - rgp = None spot = None for neighbor in gene.family.neighbors: if neighbor.named_partition == "persistent": @@ -591,18 +590,16 @@ def write_org_file(org: Organism, output: Path, compress: bool = False): nb_cloud += 1 row = [gene.ID if gene.local_identifier == "" else gene.local_identifier, contig.name, gene.start, gene.stop, gene.strand, gene.family.name, - len(gene.family.get_genes_per_org(org)), gene.family.named_partition, + len(list(gene.family.get_genes_per_org(org))), 
gene.family.named_partition, nb_pers, nb_shell, nb_cloud] if needRegions: - if len(gene.RGP) > 0: - rgp = ','.join([str(region.name) for region in gene.RGP]) - row.append(rgp) + row.append(gene.RGP.name if gene.RGP is not None else gene.RGP) if needSpots: - if len(gene.family.spot) > 0: - spot = ','.join([str(s.ID) for s in gene.family.spot]) + if gene.family.number_of_spots > 0: + spot = ','.join([str(spot.ID) for spot in gene.family.spots]) row.append(spot) if needModules: - if len(gene.family.modules) > 0: + if gene.family.number_of_modules > 0: modules = ','.join(["module_" + str(module.ID) for module in gene.family.modules]) row.append(modules) outfile.write("\t".join(map(str, row)) + "\n") @@ -643,9 +640,9 @@ def write_parts(output: Path, soft_core: float = 0.95): part_sets[fam.named_partition].add(fam.name) if fam.partition.startswith("S"): part_sets[fam.partition].add(fam.name) - if len(fam.organisms) >= len(pan.organisms) * soft_core: + if fam.number_of_organisms >= pan.number_of_organisms * soft_core: part_sets["soft_core"].add(fam.name) - if len(fam.organisms) == len(pan.organisms): + if fam.number_of_organisms == pan.number_of_organisms: part_sets["exact_core"].add(fam.name) else: part_sets["exact_accessory"].add(fam.name) @@ -690,10 +687,11 @@ def write_regions(output: Path, compress: bool = False): fname = output / "plastic_regions.tsv" with write_compressed_or_not(fname, compress) as tab: tab.write("region\torganism\tcontig\tstart\tstop\tgenes\tcontigBorder\twholeContig\n") - regions = sorted(pan.regions, key=lambda x: (x.organism.name, x.contig.name, x.start)) + regions = sorted(pan.regions, key=lambda x: (x.organism.name, x.contig.name, x.starter.start)) for region in regions: - tab.write('\t'.join(map(str, [region.name, region.organism, region.contig, region.start, region.stop, - len(region.genes), region.is_contig_border, region.is_whole_contig])) + "\n") + tab.write('\t'.join(map(str, [region.name, region.organism, region.contig, 
region.starter.start, + region.stopper.stop, len(region), region.is_contig_border, + region.is_whole_contig])) + "\n") def summarize_spots(spots: set, output: Path, compress: bool = False, file_name="summarize_spots.tsv"): @@ -716,19 +714,18 @@ def r_and_s(value: float): with write_compressed_or_not(file_path, compress) as fout: fout.write("spot\tnb_rgp\tnb_families\tnb_unique_family_sets\tmean_nb_genes\t" "stdev_nb_genes\tmax_nb_genes\tmin_nb_genes\n") - for spot in sorted(spots, key=lambda x: len(x.regions), reverse=True): + for spot in sorted(spots, key=lambda x: len(x), reverse=True): tot_fams = set() - rgp_list = list(spot.regions) len_uniq_content = len(spot.get_uniq_content()) size_list = [] for rgp in spot.regions: - tot_fams |= rgp.families - size_list.append(len(rgp.genes)) + tot_fams |= set(rgp.families) + size_list.append(len(rgp)) mean_size = mean(size_list) stdev_size = stdev(size_list) if len(size_list) > 1 else 0 max_size = max(size_list) min_size = min(size_list) - fout.write("\t".join(map(r_and_s, [f"{spot.ID}", len(rgp_list), len(tot_fams), len_uniq_content, + fout.write("\t".join(map(r_and_s, [f"{str(spot)}", len(spot), len(tot_fams), len_uniq_content, mean_size, stdev_size, max_size, min_size])) + "\n") logging.getLogger("PPanGGOLiN").info(f"Done writing spots in '{file_path}'") @@ -753,7 +750,7 @@ def write_spots(output: Path, compress: bool = False): :param output: Path to output directory :param compress: Compress the file in .gz """ - if len(pan.spots) > 0: + if pan.number_of_spots > 0: spot2rgp(pan.spots, output, compress) summarize_spots(pan.spots, output, compress) @@ -769,7 +766,7 @@ def write_borders(output: Path, dup_margin: float = 0.05, compress: bool = False all_fams = set() with write_compressed_or_not(output / "spot_borders.tsv", compress) as fout: fout.write("spot_id\tnumber\tborder1\tborder2\n") - for spot in sorted(pan.spots, key=lambda x: len(x.regions), reverse=True): + for spot in sorted(pan.spots, key=lambda x: len(x), 
reverse=True): curr_borders = spot.borders(pan.parameters["spot"]["set_size"], multigenics) for c, border in curr_borders: famstring1 = ",".join([fam.name for fam in border[0]]) @@ -802,8 +799,8 @@ def write_module_summary(output: Path, compress: bool = False): for gene in family.genes: org_dict[gene.organism].add(gene) fout.write( - f"module_{mod.ID}\t{len(mod.families)}\t{len(org_dict)}\t{partition_counter.most_common(1)[0][0]}\t" - f"{round((sum([len(genes) for genes in org_dict.values()]) / len(org_dict)) / len(mod.families), 3)}\n") + f"module_{mod.ID}\t{len(mod)}\t{len(org_dict)}\t{partition_counter.most_common(1)[0][0]}\t" + f"{round((sum([len(genes) for genes in org_dict.values()]) / len(org_dict)) / len(mod), 3)}\n") fout.close() logging.getLogger("PPanGGOLiN").info(f"Done writing module summary: '{output.as_posix() + '/modules_summary.tsv'}'") @@ -839,9 +836,9 @@ def write_org_modules(output: Path, compress: bool = False): for mod in pan.modules: mod_orgs = set() for fam in mod.families: - mod_orgs |= fam.organisms + mod_orgs |= set(fam.organisms) for org in mod_orgs: - completion = round(len(org.families & mod.families) / len(mod.families), 2) + completion = round((org.number_of_families() + len(mod)) / len(mod), 2) fout.write(f"module_{mod.ID}\t{org.name}\t{completion}\n") fout.close() logging.getLogger("PPanGGOLiN").info( @@ -985,6 +982,7 @@ def write_flat_files(pangenome: Pangenome, output: Path, cpu: int = 1, soft_core needPartitions = True if gexf or light_gexf or json: needGraph = True + needRegions = True if pan.status["predictedRGP"] == "inFile" else False needSpots = True if pan.status["spots"] == "inFile" else False needModules = True if pan.status["modules"] == "inFile" else False if pangenome.status["metadata"]["families"] == "inFile": @@ -1005,7 +1003,8 @@ def write_flat_files(pangenome: Pangenome, output: Path, cpu: int = 1, soft_core check_pangenome_info(pangenome, need_annotations=needAnnotations, need_families=needFamilies, 
need_graph=needGraph, need_partitions=needPartitions, need_rgp=needRegions, need_spots=needSpots, - need_modules=needModules, need_metadata=needMetadata, metatype=metatype, sources=None, disable_bar=disable_bar) + need_modules=needModules, need_metadata=needMetadata, metatype=metatype, sources=None, + disable_bar=disable_bar) pan.get_org_index() # make the index because it will be used most likely with get_context('fork').Pool(processes=cpu) as p: if csv: @@ -1059,6 +1058,7 @@ def launch(args: argparse.Namespace): families_tsv=args.families_tsv, spots=args.spots, borders=args.borders, modules=args.modules, spot_modules=args.spot_modules, compress=args.compress, disable_bar=args.disable_prog_bar) + def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: """ Subparser to launch PPanGGOLiN in Command line @@ -1122,6 +1122,7 @@ def parser_flat(parser: argparse.ArgumentParser): help="writes 3 files comparing the presence of modules within spots") optional.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus") + if __name__ == '__main__': """To test local change and allow using debugger""" from ppanggolin.utils import set_verbosity_level, add_common_arguments @@ -1133,4 +1134,4 @@ def parser_flat(parser: argparse.ArgumentParser): parser_flat(main_parser) add_common_arguments(main_parser) set_verbosity_level(main_parser.parse_args()) - launch(main_parser.parse_args()) \ No newline at end of file + launch(main_parser.parse_args()) diff --git a/ppanggolin/formats/writeMSA.py b/ppanggolin/formats/writeMSA.py index 0f08bde7..6c7363f5 100644 --- a/ppanggolin/formats/writeMSA.py +++ b/ppanggolin/formats/writeMSA.py @@ -35,7 +35,7 @@ def is_single_copy(family: GeneFamily, dup_margin: float = 0.95) -> bool: for gene_list in family.get_org_dict().values(): if len(gene_list) > 1: nb_multi += 1 - dup_ratio = nb_multi / len(family.organisms) + dup_ratio = nb_multi / family.number_of_organisms if dup_ratio < 
dup_margin: return True return False @@ -55,10 +55,10 @@ def get_families_to_write(pangenome: Pangenome, partition_filter: str = "core", :return: set of families unique to one partition """ families = set() - nb_org = pangenome.number_of_organisms() + nb_org = pangenome.number_of_organisms if partition_filter == "all": - return set(pangenome.gene_families) + return pangenome.gene_families if partition_filter in ["persistent", "shell", "cloud"]: for family in pangenome.gene_families: if family.named_partition == partition_filter: @@ -70,7 +70,7 @@ def get_families_to_write(pangenome: Pangenome, partition_filter: str = "core", elif partition_filter in ["core", "accessory", "softcore"]: if partition_filter == "core": for family in pangenome.gene_families: - if len(family.organisms) == nb_org: + if family.number_of_organisms == nb_org: if single_copy: if is_single_copy(family, dup_margin): families.add(family) @@ -78,7 +78,7 @@ def get_families_to_write(pangenome: Pangenome, partition_filter: str = "core", families.add(family) elif partition_filter == "accessory": for family in pangenome.gene_families: - if len(family.organisms) < nb_org: + if family.number_of_organisms < nb_org: if single_copy: if is_single_copy(family, dup_margin): families.add(family) @@ -86,7 +86,7 @@ def get_families_to_write(pangenome: Pangenome, partition_filter: str = "core", families.add(family) elif partition_filter == "softcore": for family in pangenome.gene_families: - if len(family.organisms) >= nb_org * soft_core: + if family.number_of_organisms >= nb_org * soft_core: if single_copy: if is_single_copy(family, dup_margin): families.add(family) diff --git a/ppanggolin/formats/writeMetadata.py b/ppanggolin/formats/writeMetadata.py index 974a6a02..1d0ffa34 100644 --- a/ppanggolin/formats/writeMetadata.py +++ b/ppanggolin/formats/writeMetadata.py @@ -114,7 +114,7 @@ def get_metadata_len(select_elem: List[Module], source: str) -> Tuple[Dict[str, else: raise Exception("Unexpected attribute. 
A recent change could create this error." " Please report the error on our github.") - for metadata in element.get_source(source=source): + for metadata in element.get_metadata_by_source(source): for attr, value in ((k, v) for k, v in metadata.__dict__.items() if k != "source"): if isinstance(value, bytes): value = value.decode('UTF-8') @@ -160,7 +160,7 @@ def write_metadata_metatype(h5f: tables.File, source: str, metatype: str, source_table = h5f.create_table(metatype_group, source, desc_metadata(*meta_len[:-1]), expectedrows=meta_len[-1]) meta_row = source_table.row for element in tqdm(select_elements, unit=metatype, desc=f'Source = {source}', disable=disable_bar): - for metadata in element.get_source(source=source): + for metadata in element.get_metadata_by_source(source): for desc in source_table.colnames: if desc == "ID": if hasattr(element, 'name') and len(element.name) > 0: diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py index eb801c93..1088782b 100644 --- a/ppanggolin/formats/writeSequences.py +++ b/ppanggolin/formats/writeSequences.py @@ -109,14 +109,14 @@ def select_families(pangenome: Pangenome, partition: str, type_name: str, soft_c elif partition == "softcore": logging.getLogger("PPanGGOLiN").info( f"Writing the {type_name} in {partition} genome, that are present in more than {soft_core} of genomes") - threshold = pangenome.number_of_organisms() * soft_core + threshold = pangenome.number_of_organisms * soft_core for fam in pangenome.gene_families: - if len(fam.organisms) >= threshold: + if fam.number_of_organisms >= threshold: genefams.add(fam) elif partition == "core": logging.getLogger("PPanGGOLiN").info(f"Writing the representative {type_name} of the {partition} gene families...") for fam in pangenome.gene_families: - if len(fam.organisms) == pangenome.number_of_organisms(): + if fam.number_of_organisms == pangenome.number_of_organisms: genefams.add(fam) elif "module_" in partition: 
logging.getLogger("PPanGGOLiN").info(f"Writing the representation {type_name} of {partition} gene families...") @@ -325,7 +325,7 @@ def write_regions_sequences(pangenome: Pangenome, output: Path, regions: str, fa loaded_genome = region.organism.name genome_sequence = read_genome_file(org_dict, loaded_genome) fasta.write(f">{region.name}\n") - fasta.write(write_spaced_fasta(genome_sequence[region.contig.name][region.start:region.stop], 60)) + fasta.write(write_spaced_fasta(genome_sequence[region.contig.name][region.starter.start:region.stopper.stop], 60)) logging.getLogger("PPanGGOLiN").info(f"Done writing the regions nucleotide sequences: '{outname}'") diff --git a/ppanggolin/geneFamily.py b/ppanggolin/geneFamily.py index 4b33c622..0bf13aef 100644 --- a/ppanggolin/geneFamily.py +++ b/ppanggolin/geneFamily.py @@ -7,8 +7,7 @@ import logging # installed libraries -from typing import Dict, List, Set - +from typing import Dict, Generator, Set import gmpy2 # local libraries @@ -20,51 +19,179 @@ class GeneFamily(MetaFeatures): """ This represents a single gene family. It will be a node in the pangenome graph, and be aware of its genes and edges. + Methods: + - named_partition: returns a meaningful name for the partition associated with the family. + - neighbors: returns all the GeneFamilies that are linked with an edge. + - edges: returns all Edges that are linked to this gene family. + - genes: returns all the genes associated with the family. + - organisms: returns all the Organisms that have this gene family. + - spots: returns all the spots associated with the family. + - modules: returns all the modules associated with the family. + - number_of_neighbor: returns the number of neighbor GeneFamilies. + - number_of_edges: returns the number of edges. + - number_of_genes: returns the number of genes. + - number_of_organisms: returns the number of organisms. + - number_of_spots: returns the number of spots. + - number_of_modules: returns the number of modules. 
+ - set_edge: sets an edge between the current family and a target family. + - add_sequence: assigns a protein sequence to the gene family. + - add_gene: adds a gene to the gene family and sets the gene's family accordingly. + - add_spot: adds a spot to the gene family. + - add_module: adds a module to the gene family. + - Mk_bitarray: produces a bitarray representing the presence/absence of the family in the pangenome using the provided index. + - get_org_dict: returns a dictionary of organisms as keys and sets of genes as values. + - get_genes_per_org: returns the genes belonging to the gene family in the given organism. - :param family_id: The internal identifier to give to the gene family - :type family_id: any - :param name: The name of the gene family (to be printed in output files) - :type name: str + Fields: + - name: the name of the gene family. + - ID: the internal identifier of the gene family. + - removed: a boolean indicating whether the family has been removed from the main graph. + - sequence: the protein sequence associated with the family. + - Partition: the partition associated with the family. 
""" def __init__(self, family_id: int, name: str): + # TODO edges as genes in contig to get and set + """Constructor method + :param family_id: The internal identifier to give to the gene family + :type family_id: any + :param name: The name of the gene family (to be printed in output files) + :type name: str + """ + assert isinstance(family_id, int), "GeneFamily object id should be an integer" + assert isinstance(name, str), "GeneFamily object name should be a string" + assert name != '', "GeneFamily object cannot be created with an empty name" + super().__init__() self.name = str(name) self.ID = family_id self._edges = {} self._genePerOrg = defaultdict(set) - self.genes = set() + self._genes_getter = {} self.removed = False # for the repeated family not added in the main graph self.sequence = "" self.partition = "" - self.spot = set() - self.modules = set() + self._spots = set() + self._modules = set() self.bitarray = None - def add_sequence(self, seq: str): - """Assigns a protein sequence to the gene family. + def __repr__(self) -> str: + """Family representation + """ + return f"{self.ID}: {self.name}" + + + + def __len__(self) -> int: + return len(self._genes_getter) - :param seq: the sequence to add to the gene family + def __setitem__(self, identifier: str, gene: Gene): + """ Set gene to Gene Family + + :param identifier: ID of the gene + :param gene: Gene object to add + + :raises TypeError: If the gene is not instance Gene + :raises TypeError: If the identifier is not instance string + :raises ValueError: If a gene in getter already exists at the name """ - self.sequence = seq + # TODO look at change start for position + + if not isinstance(gene, Gene): + raise TypeError(f"'Gene' type was expected but you provided a '{type(gene)}' type object") + if not isinstance(identifier, str): + raise TypeError(f"Gene ID should be a string. 
You provided a '{type(identifier)}' type object") + if identifier in self._genes_getter: + raise KeyError(f"Gene with name {identifier} already exists in the gene family") + self._genes_getter[identifier] = gene + + # TODO define eq function + + # retrieve gene by start position + def __getitem__(self, identifier: str) -> Gene: + """Get the gene for the given name + + :param identifier: ID of the gene in the gene family - def add_partition(self, partition: str): - """Assigns a partition to the gene family. It should be the raw partition name provided by NEM. + :return: Wanted gene - :param partition: The partition + :raises TypeError: If the identifier is not instance string + :raises KeyError: Gene with the given identifier does not exist in the contig """ - self.partition = partition + if not isinstance(identifier, str): + raise TypeError(f"Gene ID should be a string. You provided a '{type(identifier)}' type object") + try: + return self._genes_getter[identifier] + except KeyError: + raise KeyError(f"Gene with the ID: {identifier} does not exist in the family") + + def __delitem__(self, identifier: str): + """Remove the gene for the given name in the gene family + + :param position: ID of the gene in the family + + :raises TypeError: If the identifier is not instance string + :raises KeyError: Gene with the given identifier does not exist in the contig + """ + if not isinstance(identifier, str): + raise TypeError(f"Gene ID should be a string. You provided a '{type(identifier)}' type object") + try: + del self._genes_getter[identifier] + except KeyError: + raise KeyError(f"Gene with the name: {identifier} does not exist in the family") + + def add(self, gene: Gene): + """Add a gene to the gene family, and sets the gene's :attr:family accordingly. 
+ + :param gene: The gene to add + + :raises TypeError: If the provided `gene` is of the wrong type + """ + if not isinstance(gene, Gene): + raise TypeError(f"'Gene' type object was expected, but '{type(gene)}' type object was provided.") + self[gene.ID] = gene + gene.family = self + if gene.organism is not None: + self._genePerOrg[gene.organism].add(gene) + + def get(self, identifier: str) -> Gene: + """Get a gene by its name + + :param identifier: ID of the gene + + :return: Wanted gene + + :raises TypeError: If the identifier is not instance string + """ + if not isinstance(identifier, str): + raise TypeError(f"Gene ID should be a string. You provided a '{type(identifier)}' type object") + return self[identifier] + + def remove(self, identifier): + """Remove a gene by its name + + :param identifier: Name of the gene + + :return: Wanted gene + + :raises TypeError: If the identifier is not instance string + """ + if not isinstance(identifier, str): + raise TypeError(f"Gene ID should be a string. You provided a '{type(identifier)}' type object") + del self[identifier] + + #TODO define __eq__ @property def named_partition(self) -> str: """Reads the partition attribute and returns a meaningful name - :raises Exception: If the gene family has no partition assigned + :return: The partition name of the gene family - :return: the partition name of the gene family + :raises ValueError: If the gene family has no partition assigned """ if self.partition == "": - raise Exception("The gene family has not beed associated to a partition") + raise ValueError("The gene family has not beed associated to a partition") if self.partition.startswith("P"): return "persistent" elif self.partition.startswith("C"): @@ -74,19 +201,135 @@ def named_partition(self) -> str: else: return "undefined" - def add_gene(self, gene: Gene): - """Add a gene to the gene family, and sets the gene's :attr:family accordingly. 
+ @property + def neighbors(self) -> Generator[GeneFamily, None, None]: + """Returns all the GeneFamilies that are linked with an edge - :param gene: the gene to add + :return: Neighbors + """ + for neighbor in self._edges.keys(): + yield neighbor - :raises TypeError: If the provided `gene` is of the wrong type + @property + def edges(self) -> Generator[Edge, None, None]: + """Returns all Edges that are linked to this gene family + + :return: Edges of the gene family """ - if not isinstance(gene, Gene): - raise TypeError(f"'Gene' type object was expected, but '{type(gene)}' type object was provided.") - self.genes.add(gene) - gene.family = self - if hasattr(gene, "organism"): - self._genePerOrg[gene.organism].add(gene) + for edge in self._edges.values(): + yield edge + + @property + def genes(self): + """Return all the genes belonging to the family + + :return: Generator of genes + """ + for gene in self._genes_getter.values(): + yield gene + + @property + def organisms(self) -> Generator[Organism, None, None]: + """Returns all the Organisms that have this gene family + + :return: Organisms that have this gene family + """ + if len(self._genePerOrg) == 0: + _ = self.get_org_dict() + for org in self._genePerOrg.keys(): + yield org + + @property + def spots(self) -> Generator[Spot, None, None]: + """Return all the spots belonging to the family + + :return: Generator of spots + """ + for spot in self._spots: + yield spot + + @property + def modules(self) -> Generator[Module, None, None]: + """Return all the modules belonging to the family + + :return: Generator of modules + """ + for module in self._modules: + yield module + @property + def number_of_neighbors(self) -> int: + """Get the number of neighbor for the current gene family + """ + return len(self._edges.keys()) + + @property + def number_of_edges(self) -> int: + """Get the number of edges for the current gene family + """ + return len(self._edges.values()) + + @property + def number_of_genes(self) -> int: + 
"""Get the number of genes for the current gene family + """ + return len(self._genes) + + @property + def number_of_organisms(self) -> int: + """Get the number of organisms for the current gene family + """ + if len(self._genePerOrg) == 0: + _ = self.get_org_dict() + return len(self._genePerOrg.keys()) + + @property + def number_of_spots(self) -> int: + """Get the number of spots for the current gene family + """ + return len(self._spots) + + @property + def number_of_modules(self) -> int: + """Get the number of modules for the current gene family + """ + return len(self._modules) + + def set_edge(self, target: GeneFamily, edge: Edge): + """Set the edge between the gene family and another one + + :param target: Neighbor family + :param edge: Edge connecting families + """ + self._edges[target] = edge + + def add_sequence(self, seq: str): + """Assigns a protein sequence to the gene family. + + :param seq: The sequence to add to the gene family + """ + assert isinstance(seq, str), "Sequence must be a string" + + self.sequence = seq + + def add_spot(self, spot: Spot): + """Add the given spot to the family + + :param spot: Spot belonging to the family + """ + from ppanggolin.region import Spot # prevent circular import error + if not isinstance(spot, Spot): + raise TypeError(f"A spot object is expected, you give a {type(spot)}") + self._spots.add(spot) + + def add_module(self, module: Module): + """Add the given module to the family + + :param module: Module belonging to the family + """ + from ppanggolin.region import Module # prevent circular import error + if not isinstance(module, Module): + raise TypeError(f"A module object is expected, you give a {type(module)}") + self._modules.add(module) def mk_bitarray(self, index: Dict[Organism, int], partition: str = 'all'): """Produces a bitarray representing the presence/absence of the family in the pangenome using the provided index @@ -114,60 +357,25 @@ def mk_bitarray(self, index: Dict[Organism, int], partition: str = 
'all'): def get_org_dict(self) -> Dict[Organism, Set[Gene]]: """Returns the organisms and the genes belonging to the gene family - :return: a dictionnary of organism as key and set of genes as values + :return: A dictionnary of organism as key and set of genes as values """ - try: - return self._genePerOrg - except AttributeError: + if len(self._genePerOrg) == 0: for gene in self.genes: + if gene.organism is None: + raise AttributeError(f"Gene: {gene.name} is not fill with organism") self._genePerOrg[gene.organism].add(gene) - return self._genePerOrg - except Exception: - raise Exception("An unexpected error occurs. Please report in our GitHub") + return self._genePerOrg - def get_genes_per_org(self, org: Organism) -> Set[Gene]: + def get_genes_per_org(self, org: Organism) -> Generator[Gene, None, None]: """Returns the genes belonging to the gene family in the given Organism :param org: Organism to look for - :return: a set of gene(s) + :return: A set of gene(s) """ - try: - return self._genePerOrg[org] - except AttributeError: - for gene in self.genes: - self._genePerOrg[gene.organism].add(gene) - return self._genePerOrg[org] - except Exception: - raise Exception("An unexpected error occurs. 
Please report in our GitHub") - - @property - def neighbors(self) -> Set[GeneFamily]: - """Returns all the GeneFamilies that are linked with an edge - - :return: Neighbors - """ - return set(self._edges.keys()) - - @property - def edges(self) -> List[Edge]: - """Returns all Edges that are linked to this gene family - - :return: Edges of the gene family - """ - return list(self._edges.values()) - - @property - def organisms(self) -> Set[Organism]: - """Returns all the Organisms that have this gene family - - :return: Organisms that have this gene family - """ - try: - return set(self._genePerOrg.keys()) - except AttributeError: # then the genes have been added before they had organisms - for gene in self.genes: - self._genePerOrg[gene.organism].add(gene) - return set(self._genePerOrg.keys()) - except Exception: - raise Exception("An unexpected error occurs. Please report in our GitHub") + if len(self._genePerOrg) == 0: + _ = self.get_org_dict() + if org not in self._genePerOrg: + raise KeyError(f"Organism don't belong to the gene family: {self.name}") + for gene in self._genePerOrg[org]: + yield gene \ No newline at end of file diff --git a/ppanggolin/genome.py b/ppanggolin/genome.py index 717bf515..0d472fea 100644 --- a/ppanggolin/genome.py +++ b/ppanggolin/genome.py @@ -5,7 +5,7 @@ # installed libraries import logging -from typing import Dict, Iterator +from typing import Dict, Generator, List import gmpy2 @@ -16,10 +16,33 @@ class Feature(MetaFeatures): """This is a general class representation of Gene, RNA - :param identifier: Identifier of the feature given by PPanGGOLiN + Methods: + - fill_annotations: fills general annotation for child classes. + - fill_parents: associates the object to an organism and a contig. + - Add_sequence: adds a sequence to the feature. + + Fields: + - ID: Identifier of the feature given by PPanGGOLiN. + - is_fragment: Boolean value indicating whether the feature is a fragment or not. + - type: Type of the feature. 
+ - start: Start position of the feature. + - stop: Stop position of the feature. + - strand: Strand associated with the feature. + - product: Associated product of the feature. + - name: Name of the feature. + - local_identifier: Identifier provided by the original file. + - organism: Parent organism of the feature. + - contig: Parent contig of the feature. + - dna: DNA sequence of the feature. """ - def __init__(self, identifier: str): + """Constructor Method + + :param identifier: Identifier of the feature + """ + assert isinstance(identifier, str), "Expected identifier should be a string" + if identifier == '': + raise ValueError("Identifier should not be empty") super().__init__() self.ID = identifier self.is_fragment = False @@ -30,17 +53,63 @@ def __init__(self, identifier: str): self.product = None self.name = None self.local_identifier = None - self.organism = None - self.contig = None + self._organism = None + self._contig = None self.dna = None - @property - def length(self) -> int: + def __str__(self) -> str: + return str(self.ID) + + def __len__(self) -> int: """Return gene length :return: gene length + + :raises ValueError: If start or stop are not defined in gene + """ + if self.start is not None: + if self.stop is not None: + return self.stop - self.start + 1 + else: + raise ValueError("Stop is not known") + else: + raise ValueError("Start is not known") + + @property + def organism(self) -> Organism: + """Return organism that Feature belongs to. + + :return: Organism of the feature + """ + return self._organism + + @organism.setter + def organism(self, organism: Organism): + """Set the organism to the Feature + + :param organism: Organism belonging to the feature + """ + if not isinstance(organism, Organism): + raise TypeError(f'Expected type Organism, got {type(organism)}') + self._organism = organism + + @property + def contig(self) -> Contig: + """Return contig that Feature belongs to. 
+ + :return: Contig of the feature + """ + return self._contig + + @contig.setter + def contig(self, contig: Contig): + """Set the contig to the Feature + + :param contig: Contig linked to the feature """ - return self.stop - self.start + if not isinstance(contig, Contig): + raise TypeError(f'Expected type Contig, got {type(contig)}') + self._contig = contig def fill_annotations(self, start: int, stop: int, strand: str, gene_type: str = "", name: str = "", product: str = "", local_identifier: str = ""): @@ -54,34 +123,60 @@ def fill_annotations(self, start: int, stop: int, strand: str, gene_type: str = :param name: Name of the feature :param product: Associated product :param local_identifier: Identifier provided by the original file + + :raises TypeError: If attribute value does not correspond to the expected type + :raises ValueError: If strand is not '+' or '-' """ - self.start = start if isinstance(start, int) else int(start) - self.stop = stop if isinstance(stop, int) else int(stop) - self.type = gene_type + if not isinstance(start, int): + raise TypeError("Start should be int") + if not isinstance(stop, int): + raise TypeError("Stop should be int") + if not isinstance(strand, str): + raise TypeError("Strand should be str") + if not isinstance(gene_type, str): + raise TypeError("Gene type should be str") + if not isinstance(name, str): + raise TypeError("Name should be str") + if not isinstance(product, str): + raise TypeError("Product should be str") + if not isinstance(local_identifier, str): + raise TypeError("Local identifier should be str") + if strand not in ["+", "-"]: + raise ValueError("Strand should be + or -") + self.start = start + self.stop = stop self.strand = strand + self.type = gene_type self.product = product self.name = name self.local_identifier = local_identifier - def fill_parents(self, organism: Organism, contig: Contig): + def fill_parents(self, organism: Organism = None, contig: Contig = None): """ Associate object to an organism and a 
contig :param organism: Parent organism :param contig: Parent contig """ - self.organism = organism - self.contig = contig + if organism is not None: + # TODO test type + self.organism = organism + if contig is not None: + self.contig = contig + else: + if contig is not None: + self.contig = contig + else: + raise AssertionError("You should provide at least organism or contig") - def add_dna(self, dna): - """ Add DNA sequence to feature + def add_sequence(self, sequence): + """Add a sequence to feature - :param dna: DNA sequence + :param sequence: Sequence corresponding to the feature - :raise TypeError: DNA sequence must be a string + :raise AssertionError: Sequence must be a string """ - if not isinstance(dna, str): - raise TypeError(f"'str' type was expected but you provided a '{type(dna)}' type object") - self.dna = dna + assert isinstance(sequence, str), f"'str' type was expected but you provided a '{type(sequence)}' type object" + self.dna = sequence class RNA(Feature): @@ -95,43 +190,91 @@ def __init__(self, rna_id: str): class Gene(Feature): - """Save gene from genome as an Object with some information for Pangenome - - :param gene_id: Identifier of the gene + """Save gene from the genome as an Object with some information for Pangenome + + Methods: + - fill_annotations: fills general annotation for the gene object and adds additional attributes such as + position and genetic code. + - Add_protein: adds the protein sequence corresponding to the translated gene to the object. + + Fields: + - position: the position of the gene in the genome. + - family: the family that the gene belongs to. + - RGP: a set of resistance gene profiles associated with the gene. + - genetic_code: the genetic code associated with the gene. + - Protein: the protein sequence corresponding to the translated gene. 
""" - def __init__(self, gene_id: str): + """Constructor method + + :param gene_id: Identifier of the gene + """ super().__init__(gene_id) self.position = None - self.family = None - self.RGP = set() + self._family = None + self._RGP = None self.genetic_code = None self.protein = None - def __str__(self) -> str: - return str(self.ID) + @property + def family(self): + """Return GeneFamily that Gene belongs to. - def fill_annotations(self, start: int, stop: int, strand: str, gene_type: str = "", name: str = "", - product: str = "", local_identifier: str = "", position: int = None, genetic_code: int = 11): + :return: Gene family of the gene + :rtype: GeneFamily """ - Fill Gene annotation provide by PPanGGOLiN dependencies + return self._family - :param start: Start position - :param stop: Stop position - :param strand: associated strand - :param gene_type: Type of the gene - :param name: Gene name - :param product: Associated product - :param local_identifier: Identifier provided by the original file - :param position: Gene localisation in genome + @family.setter + def family(self, family): + """Set the GeneFamily blonging to the gene + + :param family: Gene family linked to the gene + """ + from ppanggolin.geneFamily import GeneFamily + if not isinstance(family, GeneFamily): + raise TypeError(f'Expected type Organism, got {type(family)}') + self._family = family + + @property + def RGP(self): + """Return the RGP that gene belongs to + + :return: RGP fo the Gene + :rtype: Region + """ + return self._RGP + + @RGP.setter + def RGP(self, region): + """Set the Region blonging to the gene + + :param region: Region linked to the gene + """ + from ppanggolin.region import Region + if not isinstance(region, Region): + raise TypeError(f'Expected type Organism, got {type(region)}') + self._RGP = region + + def fill_annotations(self, position: int = None, genetic_code: int = 11, **kwargs): + """Fill Gene annotation provide by PPanGGOLiN dependencies + + :param position: Gene 
localization in genome :param genetic_code: Genetic code associated to gene + :param kwargs: look at Feature.fill_annotations methods + + :raises TypeError: If position or genetic code value is not instance integers """ - super().fill_annotations(start, stop, strand, gene_type, name, product, local_identifier) + super().fill_annotations(**kwargs) + if position is not None and not isinstance(position, int): + raise TypeError("position should be an integer") + if not isinstance(genetic_code, int): + raise TypeError("Genetic code should be an integer") self.position = position self.genetic_code = genetic_code def add_protein(self, protein: str): - """ Add protein sequence corresponding to translated gene + """Add a protein sequence corresponding to translated gene :param protein: Protein sequence @@ -145,150 +288,408 @@ def add_protein(self, protein: str): class Contig: """ Describe the contig content and some information - - :param name: Name of the contig - :param is_circular: save if the contig is circular + Methods: + - genes: Returns a list of gene objects present in the contig. + - add_rna: Adds an RNA object to the contig. + - add_gene: Adds a gene object to the contig. + + Fields: + - name: Name of the contig. + - is_circular: Boolean value indicating whether the contig is circular or not. + - RNAs: Set of RNA annotations present in the contig. """ + def __init__(self, name: str, is_circular: bool = False): + """Constructor method + + :param name: Name of the contig + :param is_circular: saves if the contig is circular + """ self.name = name self.is_circular = is_circular - self.RNAs = set() # saving the rna annotations. We're not using them in the vast majority of cases. - self._genes_start = {} + self._rna_getter = set() # Saving the rna annotations. We're not using them in the vast majority of cases. 
+ self._genes_getter = {} self._genes_position = [] + self._organism = None - @property - def genes(self) -> list: - """ Give the gene content of the contig + def __str__(self) -> str: + return self.name + + def __setitem__(self, start: int, gene: Gene): + """ Set gene to Contig + + :param start: Start position of the gene + :param gene: Gene object to add - :return: list of gene in contig + :raises TypeError: If the gene is not instance Gene + :raises ValueError: If a gene in getter already exists at the start + :raises AttributeError: If the gene position in the contig is not fill """ - return self._genes_position + # TODO look at change start for position - def __str__(self) -> str: - return self.name + if not isinstance(gene, Gene): + raise TypeError(f"'Gene' type was expected but you provided a '{type(gene)}' type object") + if start in self._genes_getter: + raise ValueError(f"Gene with start position {start} already exists in the contig") + if gene.position is None: + raise AttributeError("The gene object needs to have its position in the contig filled before adding it") + # Adding empty values. + # They should be filled by the end of the parsing. + # Doing this because genes are not always met in order. 
+ self._genes_position.extend([None] * (gene.position - len(self._genes_position) + 1)) + self._genes_position[gene.position] = gene + self._genes_getter[gene.start] = gene - def __iter__(self): - return iter(self.genes) + # TODO define eq function # retrieve gene by start position - def __getitem__(self, index: int): - gene = self._genes_start.get(index) - if not gene: - if not isinstance(index, int): - raise TypeError(f"Expected type is int, given type was '{type(index)}'") - raise IndexError(f"No gene start at the given position {index}") + def __getitem__(self, position: int) -> Gene: + """Get the gene for the given position + + :param position: Position of the gene in the contig + + :return: Wanted gene for the position + + :raises TypeError: If position is not an integer + """ + if not isinstance(position, int): + raise TypeError(f"Expected type is int, given type was '{type(position)}'") + try: + return self._genes_position[position] + except KeyError: + raise KeyError("Position of the gene in the contig does not exist") + + def __delitem__(self, position): + """Remove the gene for the given position in the contig + + :param position: Position of the gene in the contig + + :raises KeyError: Gene at the given position does not exist in the contig + """ + if not isinstance(position, int): + raise TypeError(f"Expected type is int, given type was '{type(position)}'") + try: + del self._genes_position[position] + except KeyError: + raise KeyError("Position of the gene in the contig does not exist") + + def add(self, gene: Gene): + """Add a gene to the contig + + :param gene: Gene to add + + :raises TypeError: Region is not an instance Region + """ + if not isinstance(gene, Gene): + raise TypeError(f"Unexpected class / type for {type(gene)} when adding it to a contig") + if gene.start is None: + raise AttributeError(f'Gene {gene.name} is not fill with start') + if gene.position is None: + raise AttributeError(f'Gene {gene.name} is not fill with position') + 
self[gene.start] = gene + + def get(self, position: int) -> Gene: + """Get a gene by its position + + :param position: Position of the gene in the contig + + :return: Wanted gene + + :raises TypeError: Position is not an integer + """ + if not isinstance(position, int): + raise TypeError(f"Position to get gene must be an integer. The provided type was {type(position)}") + gene = self[position] + if gene is None: + logging.getLogger("PPanGGOLiN").debug("Given position result with a None Gene") return gene + def remove(self, position): + """Remove a gene by its position + + :param position: Position of the gene in the contig + + :raises TypeError: Position is not an integer + """ + if not isinstance(position, int): + raise TypeError(f"Position to get gene must be an integer. The provided type was {type(position)}") + del self[position] + + def get_genes(self, begin: int, end: int) -> List[Gene]: + """Gets a list of genes within a range + + :param begin: Position of the first gene to retrieve + :param end: Position of the last gene to not retrieve + + :return: List of genes between begin and end position + + :raises TypeError: If begin or end is not an integer + :raises ValueError: If begin position is greater than end positon + """ + if not isinstance(begin, int) or not isinstance(end, int): + raise TypeError(f"Expected type is int, given type was '{type(begin)}, {type(end)}'") + if end < begin: + raise ValueError("End position is lower than begin position") + else: + return self._genes_position[begin: end] + + @property + def number_of_genes(self) -> int: + return len(self._genes_position) + + @property + def genes(self) -> Generator[Gene, None, None]: + """ Give the gene content of the contig + + :return: Generator of genes in contig + """ + for gene in self._genes_position: + if gene is not None: + yield gene + + @property + def organism(self) -> Organism: + """Return organism that Feature belongs to. 
+ + :return: Organism of the feature + """ + return self._organism + + @organism.setter + def organism(self, organism: Organism): + """Set the organism belonging to the contig + + :param organism: Organism to set + + :raises TypeError: Given organism is not an instance Organism + """ + if not isinstance(organism, Organism): + raise TypeError(f'Expected type Organism, got {type(organism)}') + self._organism = organism + def add_rna(self, rna: RNA): """ Add RNA to contig :param rna: RNA object to add + + :raises TypeError: RNA is not instance RNA + :raises KeyError: Another RNA with the same ID already exists in the contig """ if not isinstance(rna, RNA): raise TypeError(f"'RNA' type was expected but you provided a '{type(rna)}' type object") - self.RNAs.add(rna) + if rna in self._rna_getter: + raise KeyError(f"RNA with the id: {rna.ID} already exist in contig {self.name}") + self._rna_getter.add(rna) - def add_gene(self, gene: Gene): - """ Add gene to Contig + @property + def RNAs(self) -> Generator[RNA, None, None]: + """Return all the RNA in the contig - :param gene: Gene object to add + :return: Generator of RNA """ - if not isinstance(gene, Gene): - raise TypeError(f"'Gene' type was expected but you provided a '{type(gene)}' type object") - if gene.position is None: - raise TypeError("The gene object needs to have its position in the contig filled before adding it") - while len(self._genes_position) <= gene.position: - # adding empty values. They should be filled by the end of the parsing. - # Doing this because genes are not always met in order. - self._genes_position.append(None) - self._genes_position[gene.position] = gene - self._genes_start[gene.start] = gene + yield from self._rna_getter class Organism(MetaFeatures): """ Describe the Genome content and some information - :param name: Name of the genome + Methods: + - `families`: Returns a set of gene families present in the organism. + - `genes`: Returns a generator to get genes in the organism. 
+ - `number_of_genes`: Returns the number of genes in the organism. + - `contigs`: Returns the values in the contig dictionary from the organism. + - `get_contig`: Gets the contig with the given identifier in the organism, adding it if it does not exist. + - `_create_contig`: Creates a new contig object and adds it to the contig dictionary. + - `mk_bitarray`: Produces a bitarray representing the presence/absence of gene families in the organism using the provided index. + + Fields: + - `name`: Name of the organism. + - `bitarray`: Bitarray representing the presence/absence of gene families in the organism. """ + def __init__(self, name: str): + """Constructor Method + + :param name: Name of the genome + """ + assert isinstance(name, str), "Organism name should be a string" + assert name != "", "Organism name should not be empty" + super().__init__() self.name = name self._contigs_getter = {} + self._families = None self.bitarray = None + def __str__(self): + return self.name + + def _set_families(self): + """Set the set of gene families belonging to organism + """ + self._families = {gene.family for gene in self.genes} + + def __setitem__(self, name: str, contig: Contig): + """ Set contig to the organism + + :param name: Name of the contig + :param contig: Contig object to add in the organism + + :raises TypeError: If the contig is not instance Contig + :raises TypeError: If the name is not instance string + :raises KeyError: Contig with the given name already exist in the organism + """ + + if not isinstance(name, str): + raise TypeError(f"Contig name should be a string. 
You provided a '{type(name)}' type object") + if not isinstance(contig, Contig): + raise TypeError(f"'Contig' type was expected but you provided a '{type(contig)}' type object") + if name in self._contigs_getter: # Add test if contig are equivalent when __eq__ method will be defined in Contig + raise KeyError(f"Contig {contig.name} already in organism {self.name}") + self._contigs_getter[contig.name] = contig + contig.organism = self + + def __getitem__(self, name: str) -> Contig: + """Get the contig for the given position + + :param name: Name of the contig + + :return: Wanted contig for the given name + + :raises TypeError: If name is not a string + :raises KeyError: Name does not exist in the organism + """ + if not isinstance(name, str): + raise TypeError(f"Expected type is string, given type was '{type(name)}'") + try: + return self._contigs_getter[name] + except KeyError: + raise KeyError(f"Contig with the name: {name} does not exist in the organism") + + def __delitem__(self, name): + """Remove the contig for the given name + + :param name: Name of the contig + + :raises TypeError: If name is not a string + :raises KeyError: Name does not exist in the organism + """ + if not isinstance(name, int): + raise TypeError(f"Expected type is int, given type was '{type(name)}'") + try: + del self._contigs_getter[name] + except KeyError: + raise KeyError("Position of the gene in the contig does not exist") + @property - def families(self) -> set: - """ returns the gene families present in the organism + def families(self): + """Return the gene families present in the organism - :return: set of gene families in organism + :return: Generator of gene families + :rtype: Generator[GeneFamily, None, None] """ - return {gene.family for contig in self.contigs for gene in contig.genes} + if self._families is None: + self._set_families() + yield from self._families + + def number_of_families(self) -> int: + """Get the number of gene families in the organism + + :return: Number 
of gene families + """ + if self._families is None: + self._set_families() + return len(self._families) @property - def genes(self) -> Iterator[Gene]: - """ Generator to get genes in organism """ + def genes(self) -> Generator[Gene, None, None]: + """Generator to get genes in the organism + + :return: Generator of genes + """ for contig in self.contigs: - for gene in contig.genes: - yield gene + yield from contig.genes def number_of_genes(self) -> int: - """ Get number of genes in organism + """ Get number of genes in the organism - :return: Number of gene in organism + :return: Number of genes """ - return sum([len(list(contig.genes)) for contig in self.contigs]) + return sum([contig.number_of_genes for contig in self.contigs]) @property - def contigs(self) -> dict.values: - """ Get contigs in organism + def contigs(self) -> Generator[Contig, None, None]: + """ Generator of contigs in the organism - :return: values in contig dictionary from organism + :return: Values in contig dictionary from organism """ - return self._contigs_getter.values() + yield from self._contigs_getter.values() - def __str__(self): - return self.name + @property + def number_of_contigs(self) -> int: + """ Get number of contigs in organism + + :return: Number of contigs in organism + """ + return len(self._contigs_getter) + + def add(self, contig: Contig): + """Add a contig to organism - def get_contig(self, contig_id: str, is_circular: bool = False): + :param: Contig to add in organism + + :raises KeyError: Contig with the given name already exist in the organism """ - Get contig with the given identifier in the organim, if it does not exist in organism,the contig is added + assert isinstance(contig, Contig), f"Contig object is expected, given type was {type(contig)}" + try: + _ = self.get(contig.name) + except KeyError: + self[contig.name] = contig + else: + raise KeyError(f"Contig {contig.name} already in organism {self.name}") + + def get(self, name: str) -> Contig: + """ + Get contig 
with the given identifier in the organism + + :param name: Contig identifier - :param contig_id: Contig idenitifier - :param is_circular: save if the contig is circular + :return: The contig with the given identifier + """ + return self[name] - :return: the contig with the given identifier + def remove(self, name: str) -> Contig: """ - contig = self._contigs_getter.get(contig_id) - if contig is None: - contig = self._create_contig(contig_id, is_circular) - return contig + Remove a contig with the given identifier in the organism + + :param name: Contig identifier - def _create_contig(self, contig_id: str, is_circular: bool = False): - new_contig = Contig(contig_id, is_circular) - self._contigs_getter[contig_id] = new_contig - return new_contig + :return: The contig with the given identifier + """ + del self[name] def mk_bitarray(self, index: Dict[Organism, int], partition: str = 'all'): """Produces a bitarray representing the presence / absence of families in the organism using the provided index The bitarray is stored in the :attr:`bitarray` attribute and is a :class:`gmpy2.xmpz` type. 
- :param partition: Filter partition + :param partition: Filters partition :param index: The index computed by :func:`ppanggolin.pangenome.Pangenome.getIndex` - """ + :raises Exception: Partition is not recognized + """ self.bitarray = gmpy2.xmpz() # pylint: disable=no-member if partition == 'all': - logging.getLogger("PPanGGOLiN").debug(f"all") + logging.getLogger("PPanGGOLiN").debug("all") for fam in self.families: self.bitarray[index[fam]] = 1 elif partition in ['shell', 'cloud']: - logging.getLogger("PPanGGOLiN").debug(f"shell, cloud") + logging.getLogger("PPanGGOLiN").debug("shell, cloud") for fam in self.families: if fam.named_partition == partition: self.bitarray[index[fam]] = 1 elif partition == 'accessory': - logging.getLogger("PPanGGOLiN").debug(f"accessory") + logging.getLogger("PPanGGOLiN").debug("accessory") for fam in self.families: if fam.named_partition in ['shell', 'cloud']: self.bitarray[index[fam]] = 1 diff --git a/ppanggolin/graph/makeGraph.py b/ppanggolin/graph/makeGraph.py index 66187440..9ae94ed9 100644 --- a/ppanggolin/graph/makeGraph.py +++ b/ppanggolin/graph/makeGraph.py @@ -86,7 +86,7 @@ def compute_neighbors_graph(pangenome: Pangenome, remove_copy_number: int = 0, remove_high_copy_number(pangenome, remove_copy_number) logging.getLogger("PPanGGOLiN").info("Computing the neighbors graph...") - bar = tqdm(pangenome.organisms, total=len(pangenome.organisms), unit="organism", disable=disable_bar) + bar = tqdm(pangenome.organisms, total=pangenome.number_of_organisms, unit="organism", disable=disable_bar) for org in bar: bar.set_description(f"Processing {org.name}") bar.refresh() @@ -103,9 +103,9 @@ def compute_neighbors_graph(pangenome: Pangenome, remove_copy_number: int = 0, raise AttributeError("a Gene does not have a GeneFamily object associated") except Exception: raise Exception("Unexpected error. 
Please report on our github.") - if prev is not None and contig.is_circular and len(contig.genes) > 0: + if prev is not None and contig.is_circular and contig.number_of_genes > 0: # if prev is None, the contig is entirely made of duplicated genes, so no edges are added - pangenome.add_edge(contig.genes[0], prev) + pangenome.add_edge(contig[0], prev) logging.getLogger("PPanGGOLiN").info("Done making the neighbors graph.") pangenome.status["neighborsGraph"] = "Computed" diff --git a/ppanggolin/main.py b/ppanggolin/main.py index 5dfbacf0..eb2a310c 100644 --- a/ppanggolin/main.py +++ b/ppanggolin/main.py @@ -73,6 +73,7 @@ def cmd_line() -> argparse.Namespace: desc += " align aligns a genome or a set of proteins to the pangenome gene families representatives and " \ "predict information from it\n" desc += " rgp predicts Regions of Genomic Plasticity in the genomes of your pangenome\n" + desc += " rgp_cluster cluster RGPs based on their gene families.\n" desc += " spot predicts spots in your pangenome\n" desc += " module Predicts functional modules in your pangenome\n" desc += " \n" @@ -93,6 +94,7 @@ def cmd_line() -> argparse.Namespace: subparsers = parser.add_subparsers(metavar="", dest="subcommand", title="subcommands", description=desc) subparsers.required = True # because python3 sent subcommands to hell apparently + # print help if no subcommand is specified if len(sys.argv) == 1: parser.print_help() @@ -168,10 +170,6 @@ def main(): if hasattr(args, "pangenome") and args.pangenome is not None: check_input_files(args.pangenome) - if hasattr(args, "fasta") and args.fasta is not None: - check_input_files(args.fasta, True) - if hasattr(args, "anno") and args.anno is not None: - check_input_files(args.anno, True) if args.subcommand == "annotate": ppanggolin.annotate.launch(args) @@ -205,6 +203,8 @@ def main(): ppanggolin.RGP.genomicIsland.launch(args) elif args.subcommand == "spot": ppanggolin.RGP.spot.launch(args) + elif args.subcommand == "rgp_cluster": + 
ppanggolin.RGP.rgp_cluster.launch(args) elif args.subcommand == "panrgp": ppanggolin.workflow.panRGP.launch(args) elif args.subcommand == "module": diff --git a/ppanggolin/meta/meta.py b/ppanggolin/meta/meta.py index 28689d5b..b07ea8c8 100644 --- a/ppanggolin/meta/meta.py +++ b/ppanggolin/meta/meta.py @@ -65,7 +65,6 @@ def check_metadata_format(metadata: Path, metatype: str) -> pd.DataFrame: :return: Dataframe with metadata loaded """ assert metatype in ["families", "genomes", "genes", "RGPs", "spots", "modules"] - colname_check = re.compile('^[a-zA-Z_][a-zA-Z0-9_]*$') metadata_df = pd.read_csv(metadata, sep="\t", header=0, quoting=csv.QUOTE_NONE, dtype={metatype: str}) @@ -85,14 +84,13 @@ def check_metadata_format(metadata: Path, metatype: str) -> pd.DataFrame: def assign_metadata(metadata_df: pd.DataFrame, pangenome: Pangenome, source: str, metatype: str, omit: bool = False, disable_bar: bool = False): - """ Add to pangenome element a metadata - - :param metadata_df: Dataframe with for each family a metadata - :param pangenome: Pangenome with gene families - :param source: source of the metadata - :param metatype: select to which pangenome element metadata will be added - :param omit: allow to omit a row in dataframe if the element name is not find in pangenomes - :param disable_bar: Disable progress bar + """function assigns metadata to elements in a pangenome based on a metadata dataframe. + :param metadata_df: A pandas dataframe containing metadata to be assigned to elements in the pangenome. + :param pangenome: A Pangenome object representing the pangenome to which metadata will be assigned. + :param source: A string representing the source of the metadata. + :param metatype: A string representing the type of element to which metadata will be assigned. + :param omit: A boolean indicating whether to raise an error if metadata cannot be assigned to an element. If True, metadata will not be assigned to elements that do not exist in the pangenome. 
If False, an error will be raised. Default is False. + :param disable_bar: A boolean indicating whether to disable the progress bar. Default is False. :raise KeyError: element name is not find in pangenome :raise AssertionError: Metatype is not recognized diff --git a/ppanggolin/metadata.py b/ppanggolin/metadata.py index 4866a8ad..a6f52cc3 100644 --- a/ppanggolin/metadata.py +++ b/ppanggolin/metadata.py @@ -2,47 +2,80 @@ # coding: utf8 # default libraries -from typing import Generator, List, Tuple, Union +import logging +from typing import Generator, List, Tuple, Union, Any +from collections import defaultdict # installed libraries from pandas import isna class Metadata: - """ - This represents a metadata link to genes, gene families, organisms, regions, spot or modules + """The Metadata class represents a metadata link to genes, gene families, organisms, regions, spot or modules. - :param source: source of the metadata - :param kwargs: all metadata name with there value - """ + Methods: + - number_of_attribute: Returns the number of attributes in the Metadata object. + - get: Returns the value of a specific attribute, or None if the attribute does not exist. + - fields: Returns a list of all the attributes in the Metadata object. + + Fields: + - source: A string representing the source of the metadata. + - **kwargs: A dictionary of attributes and values representing the metadata. The attributes can be any string, and the values can be any type except None or NaN. + """ def __init__(self, source: str, **kwargs): """Constructor Method + + :param source: A string representing the source of the metadata. + :param kwargs: A dictionary of attributes and values representing the metadata. The attributes can be any string, and the values can be any type except None or NaN. 
+ + :raises TypeError: Source name is not a string + :raises Exception: Source name is empty + :raises Exception: Metadata is empty """ + if not isinstance(source, str): + raise TypeError(f"Metadata source name must be a string. Given type {type(source)}") + if source == "": + raise ValueError("Metadata source name should not be empty.") self.source = source + if len(kwargs) == 0: + raise Exception(f"No metadata given for source: {source}") for attr, value in kwargs.items(): + if isinstance(value, list): + value = self._join_list(value) if value is not None and not isna(value): - if isinstance(value, list): - value = self._join_list(value) setattr(self, attr, value) - def __len__(self): - return len(self.__dict__.keys()) + def __repr__(self): + return f"Metadata source: {self.source}, #attr: {len(self)}" - def get(self, name: str, skip_error: bool = False): - try: - value = self.__getattribute__(name) - except AttributeError as attr_error: - if skip_error: - return None - else: - raise attr_error - else: - return value + def __len__(self) -> int: + """Get the number of attribute links to the metadata object + + :return: Number of fields (atribute) of the metadata + """ + return len(self.__dict__) - 1 + + def __getattr__(self, item: str) -> Any: + """Get the value corresponding to the given attibute + + :return: Value of the attribute + + :raises AttributeError: The attribute does not exist in the metadata + """ + if item not in self.__dict__: + raise AttributeError(f"{item} is not an attribute of metadata") + return self.__dict__[item] @property def fields(self) -> List[str]: - return list(self.__dict__.keys()) + """Get all the field of the metadata + + :return: List of the field in the metadata + """ + fields = list(self.__dict__) + fields.remove("source") + return fields @staticmethod def _join_list(attr_list: Union[str, List[str]]): @@ -51,71 +84,106 @@ def _join_list(attr_list: Union[str, List[str]]): class MetaFeatures: """ - This represents a methods to 
access metadata in genes, gene families, organisms, regions, spot or modules + The MetaFeatures class provides methods to access and manipulate metadata in all ppanggolin classes. + + Methods + metadata: Generate all metadata from all sources. + sources: Generate all metadata sources. + get_metadata: Get metadata based on attribute values. + max_metadata_by_source: Gets the source with the maximum number of metadata and the corresponding count. """ + def __init__(self): - self._metadataGetter = {} + """Constructor method + """ + self._metadata_getter = defaultdict(list) @property def metadata(self) -> Generator[Metadata, None, None]: - """Generate metadatas in gene families + """Generate metadata in gene families - :return: Generator with all metadata from all sources + :return: Metadata from all sources """ - for meta_list in self._metadataGetter.values(): + for meta_list in self._metadata_getter.values(): for metadata in meta_list: yield metadata @property - def sources(self) -> List[str]: + def sources(self) -> Generator[str, None, None]: """ Get all metadata source in gene family - :return: List of metadata source + :return: Metadata source + """ + yield from self._metadata_getter.keys() + + def add_metadata(self, source, metadata): + """Add metadata to metadata getter + + :param source: Name of the metadata source + :param metadata: metadata value to add for the source + + :raises AssertionError: Source or metadata is not with the correct type """ - return list(self._metadataGetter.keys()) + assert isinstance(metadata, Metadata), f"Metadata is not with type Metadata but with {type(metadata)}" + assert isinstance(source, str), f"Source is not a string but with {type(source)}" + + self._metadata_getter[source].append(metadata) + + def get_metadata_by_source(self, source: str) -> Union[List[Metadata], None]: + """Get all the metadata feature corresponding to the source - def get_source(self, source: str) -> Union[List[Metadata], None]: - """ Get the metadata for 
a specific source in gene family + :param source: Name of the source to get - :param source: Name of the source + :return: List of metadata corresponding to the source - :return: All the metadata from the source if exist else None + :raises AssertionError: Source is not with the correct type """ - return self._metadataGetter[source] if source in self.sources else None + assert isinstance(source, str), f"Source is not a string but with {type(source)}" + return self._metadata_getter.get(source) # if source in _metadata_getter return value else None - def get_metadata(self, **kwargs) -> Generator[Metadata, None, None]: + def get_metadata_by_attribute(self, **kwargs) -> Generator[Metadata, None, None]: """Get metadata by one or more attribute - :return: metadata searched + :return: Metadata searched """ for metadata in self.metadata: for attr, value in kwargs.items(): if hasattr(metadata, attr): - if metadata.__getattribute__(attr) in value or metadata.__getattribute__(attr) == value: + # BUG If value is a list, the join block detection. 
+ # It would be better to keep a list and change in writing and reading metadata to join the list + if getattr(metadata, attr, None) in value or getattr(metadata, attr, None) == value: yield metadata - def add_metadata(self, source: str, metadata: Metadata): - """ Add metadata + def del_metadata_by_source(self, source: str): + """Remove a source from the feature - :param source: Name of database source - :param metadata: Identifier of the metadata + :param source: Name of the source to delete + + :raises AssertionError: Source is not with the correct type + :raises KeyError: Source does not belong in the MetaFeature """ - assert isinstance(metadata, Metadata) - source_annot = self.get_source(source) - if source_annot is not None: - self._metadataGetter[source].append(metadata) - else: - self._metadataGetter[source] = [metadata] + assert isinstance(source, str), f"Source is not a string but with {type(source)}" + if self._metadata_getter.pop(source, None) is None: + logging.getLogger("PPanGGOLiN").warning("The source to remove does not exist") + + def del_metadata_by_attribute(self, **kwargs): + """Remove a source from the feature + + :param source: Name of the source to delete + """ + for source, metadata in self._metadata_getter.items(): + for attr, value in kwargs.items(): + if hasattr(metadata, attr): + # BUG If value is a list, the join block detection. 
+ # It would be better to keep a list and change in writing and reading metadata to join the list + if getattr(metadata, attr, None) in value or getattr(metadata, attr, None) == value: + self._metadata_getter[source].remove(metadata) def max_metadata_by_source(self) -> Tuple[str, int]: - """Get the maximum number of annotation for one source - :return: Name of the source with the maximum annotation and the number of annotation corresponding + """Get the maximum number of metadata for one source + + :return: Name of the source with the maximum annotation and the number of metadata corresponding """ - max_meta = 0 - max_source = None - for source, metadata in self._metadataGetter.items(): - if len(metadata) > max_meta: - max_meta = len(metadata) - max_source = source - return max_source, max_meta \ No newline at end of file + max_source, max_meta = max(self._metadata_getter.items(), key=lambda x: len(x[1])) + return max_source, len(max_meta) diff --git a/ppanggolin/metrics/fluidity.py b/ppanggolin/metrics/fluidity.py index b8aaf46b..6c44fcb5 100644 --- a/ppanggolin/metrics/fluidity.py +++ b/ppanggolin/metrics/fluidity.py @@ -40,7 +40,7 @@ def gen_fluidity(pangenome: Pangenome, disable_bar: bool = False) -> dict: common_fam = popcount(c_organisms[0].bitarray & c_organisms[1].bitarray) - 1 if tot_fam > 0 and common_fam > 0: g_sum += (tot_fam - 2 * common_fam) / tot_fam - fluidity_dict[subset] = (2 / (pangenome.number_of_organisms() * (pangenome.number_of_organisms() - 1))) * g_sum + fluidity_dict[subset] = (2 / (pangenome.number_of_organisms * (pangenome.number_of_organisms - 1))) * g_sum return fluidity_dict @@ -90,8 +90,8 @@ def fam_fluidity(pangenome: Pangenome, disable_bar: bool = False) -> dict: common_fam = popcount(c_fam[0].bitarray & c_fam[1].bitarray) - 1 if tot_org > 0 and common_fam > 0: f_sum += (tot_org - 2 * common_fam) / tot_org - fluidity_dict[subset] = (2 / (pangenome.number_of_gene_families() * - (pangenome.number_of_gene_families() - 1))) * f_sum + 
fluidity_dict[subset] = (2 / (pangenome.number_of_gene_families * + (pangenome.number_of_gene_families - 1))) * f_sum return fluidity_dict diff --git a/ppanggolin/mod/module.py b/ppanggolin/mod/module.py index 2058fbb2..df3c8c5b 100644 --- a/ppanggolin/mod/module.py +++ b/ppanggolin/mod/module.py @@ -32,24 +32,24 @@ def check_pangenome_former_modules(pangenome: Pangenome, force: bool = False): erase_pangenome(pangenome, modules=True) -def compute_mod_graph(organisms: list, t: int = 1, disable_bar: bool = False): +def compute_mod_graph(pangenome: Pangenome, t: int = 1, disable_bar: bool = False): """ Computes a graph using all provided genomes with a transitive closure of size t - :param organisms: the list of organisms to compute the graph with + :param pangenome: pangenome with organisms to compute the graph :param t: the size of the transitive closure :param disable_bar: whether to show a progress bar or not """ g = nx.Graph() - for org in tqdm(organisms, unit="genome", disable=disable_bar): + for org in tqdm(pangenome.organisms, total=pangenome.number_of_organisms, unit="genome", disable=disable_bar): for contig in org.contigs: - if len(contig.genes) > 0: - start_gene = contig.genes[0] + if contig.number_of_genes > 0: + start_gene = contig[0] g.add_node(start_gene.family) add_gene(g.nodes[start_gene.family], start_gene, fam_split=False) for i, gene in enumerate(contig.genes): - for j, a_gene in enumerate(contig.genes[i + 1:i + t + 2], start=i + 1): + for j, a_gene in enumerate(contig.get_genes(i + 1, i + t + 2), start=i + 1): g.add_edge(gene.family, a_gene.family) edge = g[gene.family][a_gene.family] add_gene(edge, gene) @@ -73,7 +73,7 @@ def compute_modules(g: nx.Graph, multi: set, weight: float = 0.85, min_fam: int """ # removing families with low presence - removed = set([fam for fam in g.nodes if len(fam.organisms) < min_fam]) + removed = set([fam for fam in g.nodes if fam.number_of_organisms < min_fam]) modules = set() c = 0 @@ -109,7 +109,7 @@ def 
predict_modules(pangenome: Pangenome, dup_margin: float = 0.05, size: int = # compute the graph with transitive closure size provided as parameter start_time = time.time() logging.getLogger("PPanGGOLiN").info("Building the graph...") - g = compute_mod_graph(pangenome.organisms, t=transitive, disable_bar=disable_bar) + g = compute_mod_graph(pangenome, t=transitive, disable_bar=disable_bar) logging.getLogger("PPanGGOLiN").info(f"Took {round(time.time() - start_time, 2)} seconds to build the graph to find modules in") logging.getLogger("PPanGGOLiN").info(f"There are {nx.number_of_nodes(g)} nodes and {nx.number_of_edges(g)} edges") @@ -122,13 +122,12 @@ def predict_modules(pangenome: Pangenome, dup_margin: float = 0.05, size: int = fams = set() for mod in modules: - fams |= mod.families + fams |= set(mod.families) + pangenome.add_module(mod) logging.getLogger("PPanGGOLiN").info(f"There are {len(fams)} families among {len(modules)} modules") logging.getLogger("PPanGGOLiN").info(f"Computing modules took {round(time.time() - start_time, 2)} seconds") - pangenome.add_modules(modules) - pangenome.status["modules"] = "Computed" pangenome.parameters["module"] = {} pangenome.parameters["module"]["size"] = size diff --git a/ppanggolin/nem/partition.py b/ppanggolin/nem/partition.py index 8094ebaf..dfa852a4 100644 --- a/ppanggolin/nem/partition.py +++ b/ppanggolin/nem/partition.py @@ -272,10 +272,11 @@ def write_nem_input_files(tmpdir: Path, organisms: set, sm_degree: int = 10) -> index_org[org] = index for fam in pan.gene_families: + fam_organisms = set(fam.organisms) # could use bitarrays if this part is limiting? 
- if not organisms.isdisjoint(fam.organisms): + if not organisms.isdisjoint(fam_organisms): curr_dat = list(default_dat) - curr_orgs = fam.organisms & organisms + curr_orgs = fam_organisms & organisms for org in curr_orgs: curr_dat[index_org[org]] = "1" dat_file.write("\t".join(curr_dat) + "\n") @@ -288,7 +289,7 @@ def write_nem_input_files(tmpdir: Path, organisms: set, sm_degree: int = 10) -> neighbor_number = 0 sum_dist_score = 0 for edge in fam.edges: # iter on the family's edges. - coverage = sum([len(gene_list) for org, gene_list in edge.organisms.items() if org in organisms]) + coverage = sum([len(gene_list) for org, gene_list in edge.get_organisms_dict().items() if org in organisms]) if coverage == 0: continue # nothing interesting to write, this edge does not exist with this subset of organisms. distance_score = coverage / len(organisms) diff --git a/ppanggolin/nem/rarefaction.py b/ppanggolin/nem/rarefaction.py index c2a5363a..af2e33b6 100644 --- a/ppanggolin/nem/rarefaction.py +++ b/ppanggolin/nem/rarefaction.py @@ -89,7 +89,7 @@ def validate_family(result: Union[Tuple[dict, None, None], Tuple[int, float, flo validated.add(node) for fam in ppp.pan.gene_families: - if not samp.isdisjoint(fam.organisms): # otherwise, useless to keep track of + if not samp.isdisjoint(set(fam.organisms)): # otherwise, useless to keep track of families.add(fam) cpt_partition[fam.name] = {"P": 0, "S": 0, "C": 0, "U": 0} @@ -371,8 +371,8 @@ def make_rarefaction_curve(pangenome: Pangenome, output: Path, tmpdir: Path = No tmpdir_obj = tempfile.TemporaryDirectory(dir=tmpdir) tmp_path = Path(tmpdir_obj.name) - if float(len(pangenome.organisms)) < max_sampling: - max_sampling = len(pangenome.organisms) + if float(pangenome.number_of_organisms) < max_sampling: + max_sampling = pangenome.number_of_organisms else: max_sampling = int(max_sampling) @@ -399,7 +399,7 @@ def make_rarefaction_curve(pangenome: Pangenome, output: Path, tmpdir: Path = No index_org = 
pangenome.compute_family_bitarrays() logging.getLogger("PPanGGOLiN").info("Done computing bitarrays. Comparing them to get exact and soft core stats " f"for {len(all_samples)} samples...") - bar = tqdm(range(len(all_samples) * len(pangenome.gene_families)), unit="gene family", disable=disable_bar) + bar = tqdm(range(len(all_samples) * pangenome.number_of_gene_families), unit="gene family", disable=disable_bar) for samp in all_samples: # make the sample's organism bitarray. samp_bitarray = gmpy2.xmpz() # pylint: disable=no-member diff --git a/ppanggolin/pangenome.py b/ppanggolin/pangenome.py index 52ec7ce2..ba35e4c1 100644 --- a/ppanggolin/pangenome.py +++ b/ppanggolin/pangenome.py @@ -2,9 +2,10 @@ # coding: utf8 # default libraries -from pathlib import Path -from typing import Iterator, List, Union, Dict, Set, Iterable, Generator +import logging import re +from typing import Iterator, List, Union, Dict, Set, Iterable, Generator +from pathlib import Path # local libraries from ppanggolin.genome import Organism, Gene @@ -31,12 +32,12 @@ def __init__(self): self._famGetter = {} self._org_index = None self._fam_index = None - self.max_fam_id = 0 + self._max_fam_id = 0 self._orgGetter = {} self._edgeGetter = {} self._regionGetter = {} - self._spots = set() - self._modules = set() + self._spotGetter = {} + self._moduleGetter = {} self.status = { 'genomesAnnotated': "No", @@ -70,41 +71,34 @@ def add_file(self, pangenome_file: Path): :func:`ppanggolin.formats.writeBinaries.writePangenome` is called. :param pangenome_file: A string representing filepath to hdf5 pangenome file to be either used or created + + :raises AssertionError: If the `pangenome_file` is not an instance of the Path class """ + assert isinstance(pangenome_file, Path), "pangenome file should be a Path object type" from ppanggolin.formats.readBinaries import get_status # importing on call instead of importing on top to avoid cross-reference problems. 
get_status(self, pangenome_file) self.file = pangenome_file.absolute().as_posix() """ Gene Methods""" - @property - def genes(self) -> list: - """Creates the geneGetter if it does not exist, and returns all the genes of all organisms in the pangenome. + def genes(self) -> Generator[Gene, None, None]: + """Generator of genes in the pangenome. - :return: list of :class:`ppanggolin.genome.Gene` - """ - try: - return list(self._geneGetter.values()) - except AttributeError: # in that case the gene getter has not been computed - self._mk_gene_getter() # make it - return self.genes # return what was expected - - def _yield_genes(self) -> Iterator[Gene]: - """ Use a generator to get all the genes of a pangenome - - :return: an iterator of Gene + :return: gene generator """ - if self.number_of_organisms() > 0: # if we have organisms, they're supposed to have genes + if self.number_of_organisms > 0: # if we have organisms, they're supposed to have genes for org in self.organisms: for contig in org.contigs: for gene in contig.genes: yield gene - elif self.number_of_gene_families() > 0: + elif self.number_of_gene_families > 0: # we might have no organism loaded, in that case there are gene families. for gene_fam in self.gene_families: for gene in gene_fam.genes: yield gene + else: + logging.getLogger("PPanGGOLiN").warning("There is no gene in your pangenome") def _mk_gene_getter(self): """ @@ -112,35 +106,39 @@ def _mk_gene_getter(self): Since the genes are never explicitly 'added' to a pangenome (but rather to a gene family, or a contig), the pangenome cannot directly extract a gene from a geneID since it does not 'know' them. - if at some point we want to extract genes from a pangenome we'll create a geneGetter. + If at some point we want to extract genes from a pangenome we'll create a geneGetter. The assumption behind this is that the pangenome has been filled and no more gene will be added. 
""" self._geneGetter = {} - for gene in self._yield_genes(): + for gene in self.genes: self._geneGetter[gene.ID] = gene def get_gene(self, gene_id: str) -> Gene: - """returns the gene that has the given geneID + """Returns the gene that has the given gene ID :param gene_id: The gene ID to look for - :return: returns the gene that has the ID `geneID` + :return: Returns the gene that has the ID `gene_id` - :raises KeyError: If the `geneID` is not in the pangenome + :raises AssertionError: If the `gene_id` is not an integer + :raises KeyError: If the `gene_id` is not in the pangenome """ + assert isinstance(gene_id, str), "Gene id should be an integer" + try: return self._geneGetter[gene_id] except AttributeError: # in that case, either the gene getter has not been computed, or the geneID is not in the pangenome. self._mk_gene_getter() # make it - return self.get_gene(gene_id) # return what was expected. If geneID does not exist it will raise an error. + return self.get_gene(gene_id) # Return what was expected. If geneID does not exist it will raise an error. 
except KeyError: raise KeyError(f"{gene_id} does not exist in the pangenome.") - def number_of_gene(self) -> int: + @property + def number_of_genes(self) -> int: """Returns the number of gene present in the pangenome - :return: the number of gene families + :return: The number of genes """ try: return len(self._geneGetter) @@ -149,78 +147,106 @@ def number_of_gene(self) -> int: return len(self._geneGetter) """Gene families methods""" - @property - def gene_families(self) -> List[GeneFamily]: - """returns all the gene families in the pangenome - - :return: list of :class:`ppanggolin.geneFamily.GeneFamily` + def max_fam_id(self): + """Get the last family identifier """ - return list(self._famGetter.values()) + return self._max_fam_id - def _create_gene_family(self, name: str) -> GeneFamily: - """Creates a gene family object with the given `name` + @max_fam_id.setter + def max_fam_id(self, value): + """Set the last family identifier - :param name: the name to give to the gene family. Must not exist already. 
+ :param value: value of the maximum family identifer + """ + self._max_fam_id = value - :return: the created GeneFamily object + @property + def gene_families(self) -> Generator[GeneFamily, None, None]: + """Returns all the gene families in the pangenome + + :return: Generator of gene families """ - new_fam = GeneFamily(family_id=self.max_fam_id, name=name) - self.max_fam_id += 1 - self._famGetter[new_fam.name] = new_fam - return new_fam + for family in self._famGetter.values(): + yield family + @property def number_of_gene_families(self) -> int: """Returns the number of gene families present in the pangenome - :return: the number of gene families + :return: The number of gene families """ return len(self._famGetter) def get_gene_family(self, name: str) -> GeneFamily: - """returns the gene family that has the given `name` + """Returns the gene family that has the given `name` :param name: The gene family name to look for - :return: returns the gene family that has the name `name` + :return: Returns the gene family that has the name `name` + + :raises AssertionError: If the `name` is not an integer + :raises KeyError: If the `name` is not corresponding to any family in the pangenome """ - return self._famGetter[name] + assert isinstance(name, str), "Name of gene family should be a string" + try: + fam = self._famGetter[name] + except KeyError: + raise KeyError(f"Gene family with name={name} is not in pangenome") + except Exception as error: + raise Exception(error) + else: + return fam - def add_gene_family(self, name: str): + def add_gene_family(self, family: GeneFamily): """ Get the :class:`ppanggolin.geneFamily.GeneFamily` object that has the given `name`. If it does not exist, creates it. - :param name: The gene family name to get if it exists, and create otherwise. + :param family: The gene family to add in pangenomes - :return: GeneFamily object. 
+ :raises KeyError: Exception if family with the same name already in pangenome + :raises Exception: Unexpected exception """ - fam = self._famGetter.get(name) - if fam is None: - fam = self._create_gene_family(name) - return fam + try: + _ = self.get_gene_family(family.name) + except KeyError: + self._famGetter[family.name] = family + self.max_fam_id += 1 + except Exception as error: + raise Exception(error) + else: + raise KeyError("Gene Family already exist") """Graph methods""" - @property - def edges(self) -> list: - """returns all the edges in the pangenome graph + def edges(self) -> Generator[Edge, None, None]: + """Returns all the edges in the pangenome graph - :return: list of :class:`ppanggolin.pangenome.Edge` + :return: Generator of edge """ - return list(self._edgeGetter.values()) + for edge in self._edgeGetter.values(): + yield edge def add_edge(self, gene1: Gene, gene2: Gene) -> Edge: """ Adds an edge between the two gene families that the two given genes belong to. - Genes object are expected, and they are also expected to have a family assigned :param gene1: The first gene :param gene2: The second gene - :return: the created Edge + :return: The created Edge + + :raises AssertionError: Genes object are expected + :raises AttributeError: Genes are not associated to any families """ - key = frozenset([gene1.family, gene2.family]) + assert isinstance(gene1, Gene) and isinstance(gene2, Gene), "Gene object are expected" + try: + family_1, family_2 = gene1.family, gene2.family + except AttributeError: + raise AttributeError("Genes are not linked to families. 
Check that you compute the gene families and post an" + " issue on our GitHub") + key = frozenset([family_1, family_2 ]) edge = self._edgeGetter.get(key) if edge is None: edge = Edge(gene1, gene2) @@ -229,74 +255,70 @@ def add_edge(self, gene1: Gene, gene2: Gene) -> Edge: edge.add_genes(gene1, gene2) return edge - def number_of_edge(self) -> int: + @property + def number_of_edges(self) -> int: """Returns the number of edge present in the pangenome - :return: the number of gene families + :return: The number of gene families """ return len(self._edgeGetter) """Organism methods""" - @property - def organisms(self) -> List[Organism]: - """returns all the organisms in the pangenome + def organisms(self) -> Generator[Organism, None, None]: + """Returns all the organisms in the pangenome - :return: list of :class:`ppanggolin.genome.Organism` + :return: Generator :class:`ppanggolin.genome.Organism` """ - return list(self._orgGetter.values()) + for organism in self._orgGetter.values(): + yield organism + @property def number_of_organisms(self) -> int: """Returns the number of organisms present in the pangenome - :return: the number of organism + :return: The number of organism """ return len(self._orgGetter) - def get_organism(self, org_name: str) -> Organism: + def get_organism(self, name: str) -> Organism: """ Get an organism that is expected to be in the pangenome using its name, which is supposedly unique. Raises an error if the organism does not exist. 
- :param org_name: Name of the Organism to get + :param name: Name of the Organism to get :return: The related Organism object - :raises KeyError: If the provided name is not in the pangenome + :raise AssertionError: If the organism name is not a string + :raises KeyError: If the provided name is not an organism in the pangenome """ + assert isinstance(name, str), "Organism name should be a string" try: - return self._orgGetter[org_name] + return self._orgGetter[name] except KeyError: - raise KeyError(f"{org_name} does not seem to be in your pangenome") + raise KeyError(f"{name} does not seem to be in your pangenome") - def add_organism(self, new_org: Union[Organism, str]) -> Organism: + def add_organism(self, organism: Organism): """ - adds an organism that did not exist previously in the pangenome if an Organism object is provided. + Adds an organism that did not exist previously in the pangenome if an Organism object is provided. If an organism with the same name exists it will raise an error. If a str object is provided, will return the corresponding organism that has this name OR create a new one if it does not exist. - :param new_org: Organism to add to the pangenome - - :return: The created organism + :param organism: Organism to add to the pangenome - :raises TypeError: if the provided `newOrg` is neither a str nor a :class:`ppanggolin.genome.Organism` + :raise AssertionError: If the organism name is not a string + :raises KeyError: if the provided organism is already in pangenome """ - if isinstance(new_org, Organism): - old_len = len(self._orgGetter) - self._orgGetter[new_org.name] = new_org - if len(self._orgGetter) == old_len: - raise KeyError(f"Redondant organism name was found ({new_org.name})." 
- f"All of your organisms must have unique names.") - elif isinstance(new_org, str): - org = self._orgGetter.get(new_org) - if org is None: - org = Organism(new_org) - self._orgGetter[org.name] = org - new_org = org + assert isinstance(organism, Organism), "An organism object is expected to be add to pangenome" + try: + self.get_organism(organism.name) + except KeyError: + self._orgGetter[organism.name] = organism else: - raise TypeError("Provide an Organism object or a str that will serve as organism name") - return new_org + raise KeyError(f"Redondant organism name was found ({organism.name})." + f"All of your organisms must have unique names.") def get_org_index(self) -> Dict[Organism, int]: # will not make a new index if it exists already """Creates an index for Organisms (each organism is assigned an Integer). @@ -318,7 +340,7 @@ def compute_family_bitarrays(self, part: str = 'all') -> Dict[Organism, int]: :param part: Filter the organism in function of the given partition - :return: the index of organisms in pangenome + :return: The index of organisms in pangenome """ if self._org_index is None: # then the bitarrays don't exist yet, since the org index does not exist either. @@ -359,112 +381,91 @@ def compute_org_bitarrays(self, part='all') -> Dict[GeneFamily, int]: return self._fam_index """RGP methods""" - @property - def regions(self) -> list: + def regions(self) -> Generator[Region, None, None]: """returns all the regions (RGP) in the pangenome :return: list of RGP """ - return list(self._regionGetter.values()) + for region in self._regionGetter.values(): + yield region - def get_region(self, region_name: str) -> Region: + def get_region(self, name: str) -> Region: """Returns a region with the given region_name. Creates it if it does not exist. 
- :param region_name: The name of the region to return + :param name: The name of the region to return :return: The region + + :raise AssertionError: If the RGP name is not a string + :raises KeyError: If the provided name is not a RGP in the pangenome """ + assert isinstance(name, str), "RGP name should be a string" + try: - return self._regionGetter[region_name] + rgp = self._regionGetter[name] except KeyError: # then the region is not stored in this pangenome. - new_region = Region(region_name) - self._regionGetter[region_name] = new_region - return new_region + raise KeyError(f"There is no RGP with name={name}") + else: + return rgp def get_multigenics(self, dup_margin: float, persistent: bool = True) -> Set[GeneFamily]: """ Returns the multigenic persistent families of the pangenome graph. A family will be considered multigenic if it is duplicated in more than `dup_margin` of the genomes where it is present. - :param dup_margin: the ratio of presence in multicopy above which a gene family is considered multigenic + :param dup_margin: The ratio of presence in multicopy above which a gene family is considered multigenic :param persistent: if we consider only the persistent genes - :return: set of gene families considered multigenic + :return: Set of gene families considered multigenic """ + assert isinstance(dup_margin, float), "Dup margin should be a float" + assert isinstance(persistent, bool), "persistent should be a boolean" + multigenics = set() for fam in self.gene_families: if fam.named_partition == "persistent" or not persistent: dup = len([genes for org, genes in fam.get_org_dict().items() if len([gene for gene in genes if not gene.is_fragment]) > 1]) - if (dup / len(fam.organisms)) >= dup_margin: # tot / nborgs >= 1.05 + if (dup / fam.number_of_organisms) >= dup_margin: # tot / nborgs >= 1.05 multigenics.add(fam) - # logging.getLogger("PPanGGOLiN").info(f"{len(multigenics)} gene families are defined as being multigenic. 
- # (duplicated in more than {dup_margin} of the genomes)") return multigenics - def add_regions(self, region_group: Union[Region, Iterable[Region]]): - """Takes an Iterable or a Region object and adds it to the pangenome + def add_region(self, region: Region): + """Add a region to the pangenome - :param region_group: a region or an Iterable of regions to add to the pangenome + :param region: Region to add in pangenome - :raises TypeError: if regionGroup is neither a Region nor an Iterable[Region] + :raise AssertionError: Error if region is not a Region object + :raise KeyError: Error if another Region exist in pangenome with the same name """ - old_len = len(self._regionGetter) - if isinstance(region_group, Iterable): - for region in region_group: - self._regionGetter[region.name] = region - if len(self._regionGetter) != len(region_group) + old_len: - raise Exception("Two regions had an identical name, which was unexpected.") - elif isinstance(region_group, Region): - self._regionGetter[region_group.name] = region_group + assert isinstance(region, Region), "A Region object is expected" + + try: + self.get_region(region.name) + except KeyError: + self._regionGetter[region.name] = region else: - raise TypeError(f"An iterable or a 'Region' type object were expected, " - f"but you provided a {type(region_group)} type object") + raise KeyError(f"A RGP with this name ({region.name} already exist in pangenome") + @property def number_of_rgp(self) -> int: """Returns the number of gene families present in the pangenome - :return: the number of gene families + :return: The number of gene families """ return len(self._regionGetter) """Spot methods""" - @property - def spots(self) -> Set[Spot]: - # TODO made as generator - return self._spots - def _yield_spot(self) -> Generator[Spot, None, None]: - """ Use a generator to get all the genes of a pangenome + def spots(self) -> Generator[Spot, None, None]: + """Generate spots in the pangenome - :return: an iterator of Gene - """ 
- if self.number_of_spots() > 0: # if we have organisms, they're supposed to have genes - for spot in self.spots: - yield spot - - def _mk_spot_getter(self): - """ - Builds the attribute _geneGetter of the pangenome - - Since the genes are never explicitly 'added' to a pangenome (but rather to a gene family, or a contig), - the pangenome cannot directly extract a gene from a geneID since it does not 'know' them. - if at some point we want to extract genes from a pangenome we'll create a geneGetter. - The assumption behind this is that the pangenome has been filled and no more gene will be added. - """ - self._spotGetter = {} - for spot in self._yield_spot(): - self._spotGetter[spot.ID] = spot - - def add_spots(self, spots: Iterable[Spot]): - """Adds the given iterable of spots to the pangenome. - - :param spots: An iterable of :class:`ppanggolin.region.Spot`. - """ - self._spots |= set(spots) + :return: Spot generator""" + yield from self._spotGetter.values() def get_spot(self, spot_id: Union[int, str]) -> Spot: + # TODO Change for only str or only int """ Returns the spot that has the given spot ID. @@ -475,68 +476,57 @@ def get_spot(self, spot_id: Union[int, str]) -> Spot: :raises KeyError: If the spot ID does not exist in the pangenome. :raises ValueError: If the provided spot ID does not have the expected format. """ - try: spot_id = int(spot_id) except ValueError: - result = re.search("^spot_(\d+)$", spot_id) + result = re.search(r"^spot_(\d+)$", spot_id) if result: spot_id = int(result.group(1)) else: raise ValueError(f"The provided spot ID '{spot_id}' does not have the expected format." "It should be an integer or in the format 'spot_'.") try: - return self._spotGetter[spot_id] - except AttributeError: - # in that case, either the gene getter has not been computed, or the spot id is not in the pangenome. - self._mk_spot_getter() # make it - return self.get_spot(spot_id) # return what was expected. If spot id does not exist it will raise an error. 
+ spot = self._spotGetter[spot_id] except KeyError: raise KeyError(f"Spot {spot_id} does not exist in the pangenome.") + else: + return spot - def number_of_spots(self) -> int: - """Returns the number of gene families present in the pangenome - - :return: the number of gene families - """ - return len(self.spots) - - """Modules methods""" + def add_spot(self, spot: Spot): + """Adds the given iterable of spots to the pangenome. - @property - def modules(self) -> Set[Module]: - # TODO made as generator - return self._modules - def _yield_module(self) -> Generator[Module, None, None]: - """ Use a generator to get all the genes of a pangenome + :param spot: Spot which should be added - :return: an iterator of Gene + :raise AssertionError: Error if spot is not a Spot object + :raise KeyError: Error if another Spot exist in pangenome with the same identifier """ - if self.number_of_modules() > 0: # if we have organisms, they're supposed to have genes - for module in self.modules: - yield module + assert isinstance(spot, Spot), "Spot object is expected" + try: + self.get_spot(spot.ID) + except KeyError: + self._spotGetter[spot.ID] = spot + except Exception as error: + raise Exception(error) + else: + raise KeyError("Spot already exist") - def _mk_module_getter(self): - """ - Builds the attribute _geneGetter of the pangenome + @property + def number_of_spots(self) -> int: + """Returns the number of gene families present in the pangenome - Since the genes are never explicitly 'added' to a pangenome (but rather to a gene family, or a contig), - the pangenome cannot directly extract a gene from a geneID since it does not 'know' them. - if at some point we want to extract genes from a pangenome we'll create a geneGetter. - The assumption behind this is that the pangenome has been filled and no more gene will be added. 
+ :return: The number of gene families """ - self._moduleGetter = {} - for module in self._yield_module(): - self._moduleGetter[module.ID] = module + return len(self._spotGetter) - def add_modules(self, modules: Iterable[Module]): - """Adds the given iterable of modules to the pangenome - - :param modules: an iterable of :class:`ppanggolin.module.Module` + """Modules methods""" + @property + def modules(self) -> Generator[Module, None, None]: + """Generate modules in the pangenome """ - self._modules |= set(modules) + yield from self._moduleGetter.values() def get_module(self, module_id: Union[int, str]) -> Module: + # TODO Change for only str or only int """ Returns the module that has the given module ID. @@ -551,7 +541,7 @@ def get_module(self, module_id: Union[int, str]) -> Module: try: module_id = int(module_id) except ValueError: - result = re.search("^module_(\d+)$", module_id) + result = re.search(r"^module_(\d+)$", module_id) if result: module_id = int(result.group(1)) else: @@ -559,14 +549,29 @@ def get_module(self, module_id: Union[int, str]) -> Module: "It should be an integer or in the format 'module_'.") try: - return self._moduleGetter[module_id] - except AttributeError: - # in that case, either the gene getter has not been computed, or the geneID is not in the pangenome. - self._mk_module_getter() # make it - return self.get_module( - module_id) # return what was expected. If geneID does not exist it will raise an error. 
+ module = self._moduleGetter[module_id] except KeyError: raise KeyError(f"Module {module_id} does not exist in the pangenome.") + else: + return module + + def add_module(self, module: Module): + """Add the given module to the pangenome + + :param module: Module to add in pangenome + + :raise AssertionError: Error if module is not a Module object + :raise KeyError: Error if another module exist in pangenome with the same name + """ + assert isinstance(module, Module), "Module object is expected" + try: + self.get_module(module.ID) + except KeyError: + self._moduleGetter[module.ID] = module + except Exception as error: + raise Exception(error) + else: + raise KeyError("Module already exist") def compute_mod_bitarrays(self, part: str = 'all') -> Dict[GeneFamily, int]: """Based on the index generated by get_fam_index, generated a bitarray @@ -587,37 +592,53 @@ def compute_mod_bitarrays(self, part: str = 'all') -> Dict[GeneFamily, int]: # case where there is an index but the bitarrays have not been computed??? 
return self._fam_index + @property def number_of_modules(self) -> int: """Returns the number of modules present in the pangenome - :return: the number of modules + :return: The number of modules """ - return len(self.modules) + return len(self._moduleGetter) """Metadata""" + def select_elem(self, metatype: str): + """Get all the element for the given metatype - def metadata_sources(self, metatype: str) -> Set[str]: - """returns all the metadata source in the pangenomes + :param metatype: Name of pangenome component that will be get - :param metatype: select to which pangenome element metadata should be searched + :return: All elements from pangenome for the metatype - :return: set of metadata source + :raise AssertionError: Error if metatype is not a string + :raise KeyError: Error if metatype is not recognized """ - assert metatype in ["families", "genomes", "genes", "RGPs", "spots", "modules"] - source_set = set() + assert isinstance(metatype, str), "Metatype name should be a string" + if metatype == "families": - elements = self.gene_families + return self.gene_families elif metatype == "genomes": - elements = self.organisms + return self.organisms elif metatype == "genes": - elements = self.genes + return self.genes elif metatype == "RGPs": - elements = self.regions + return self.regions elif metatype == "spots": - elements = self.spots - else: # metatype == "modules": - elements = self.modules - for elem in elements: + return self.spots + elif metatype == "modules": + return self.modules + else: + raise KeyError("Given metatype is not allowed") + + def metadata_sources(self, metatype: str) -> Set[str]: + """Returns all the metadata source in the pangenomes + + :param metatype: Select to which pangenome element metadata should be searched + + :return: Set of metadata source + + :raise AssertionError: Error if metatype is not a string + """ + source_set = set() + for elem in self.select_elem(metatype): for source_metadata in elem.sources: 
source_set.add(source_metadata) return source_set @@ -625,42 +646,25 @@ def metadata_sources(self, metatype: str) -> Set[str]: def metadata(self, metatype: str) -> Generator[Metadata, None, None]: """Create a generator with all metadatas in the pangenome - :return: set of metadata source + :param metatype: Select to which pangenome element metadata should be generate + + :return: Set of metadata source """ - assert metatype in ["families", "genomes", "genes", "RGPs", "spots", "modules"] - if metatype == "families": - elements = self.gene_families - elif metatype == "genomes": - elements = self.organisms - elif metatype == "genes": - elements = self.genes - elif metatype == "RGPs": - elements = self.regions - elif metatype == "spots": - elements = self.spots - else: # metatype == "modules": - elements = self.modules - for elem in elements: + for elem in self.select_elem(metatype): yield elem.metadata - def get_elem_by_metadata(self, metatype: str, **kargs) -> Generator[ + def get_elem_by_metadata(self, metatype: str, **kwargs) -> Generator[ Union[GeneFamily, Gene, Organism, Region, Spot, Module], None, None]: - assert metatype in ["families", "genomes", "genes", "RGPs", "spots", "modules"] - if metatype == "families": - elements = self.gene_families - elif metatype == "genomes": - elements = self.organisms - elif metatype == "genes": - elements = self.genes - elif metatype == "RGPs": - elements = self.regions - elif metatype == "spots": - elements = self.spots - else: # metatype == "modules": - elements = self.modules - for element in elements: - if len(list(element.get_metadata(**kargs))) > 0: - yield element + """Get element in pangenome with metadata attribute expected + + :param metatype: Select to which pangenome element metadata + :param kwargs: attributes to identify metadata + + :return: Metadata element + """ + for elem in self.select_elem(metatype): + if len(list(elem.get_metadata_by_attribute(**kwargs))) > 0: + yield elem def get_elem_by_sources(self, 
source: List[str], metatype: str) -> Generator[ Union[GeneFamily, Gene, Organism, Region, Spot, Module], None, None]: @@ -671,18 +675,6 @@ def get_elem_by_sources(self, source: List[str], metatype: str) -> Generator[ :return: Gene families with the source """ assert metatype in ["families", "genomes", "genes", "RGPs", "spots", "modules"] - if metatype == "families": - elements = self.gene_families - elif metatype == "genomes": - elements = self.organisms - elif metatype == "genes": - elements = self.genes - elif metatype == "RGPs": - elements = self.regions - elif metatype == "spots": - elements = self.spots - else: # metatype == "modules": - elements = self.modules - for element in elements: - if element.get_source(source) is not None: - yield element + for elem in self.select_elem(metatype): + if elem.get_metadata_by_source(source) is not None: + yield elem diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index aa45dffa..869e9d3b 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -138,7 +138,7 @@ def launch(args: argparse.Namespace): input_organism = annotate_organism(org_name=args.organism_name, file_name=args.fasta_file, circular_contigs=[], tmpdir=args.tmpdir, code=args.translation_table, norna=annotate_params.norna, kingdom=annotate_params.kingdom, - overlap=annotate_params.allow_overlap, procedure=annotate_params.prodigal_procedure) + allow_overlap=annotate_params.allow_overlap, procedure=annotate_params.prodigal_procedure) if input_organism.number_of_genes() == 0: raise ValueError("No genes have been predicted in the input organism's FASTA file, making projection impossible.") @@ -183,7 +183,7 @@ def launch(args: argparse.Namespace): # then no spot will be found input_org_spots = {} else: - input_org_spots = predict_spots_in_input_organism(initial_spots=pangenome.spots, + input_org_spots = predict_spots_in_input_organism(initial_spots=list(pangenome.spots), 
initial_regions=pangenome.regions, input_org_rgps=input_org_rgps, multigenics=multigenics, output=output_dir, @@ -307,11 +307,12 @@ def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_org try: gene_family = seqid_to_gene_family[gene_id] - gene_family.add_gene(gene) + gene_family.add(gene) except KeyError: - new_gene_family = pangenome.add_gene_family(gene_id) - new_gene_family.add_gene(gene) - new_gene_family.add_partition("Cloud") + new_gene_family = GeneFamily(pangenome.max_fam_id, gene_id) + pangenome.add_gene_family(new_gene_family) + new_gene_family.add(gene) + new_gene_family.partition = "Cloud" lonely_gene += 1 logging.getLogger().info(f"The input organism has {lonely_gene}/{input_organism.number_of_genes()} " @@ -366,15 +367,15 @@ def write_predicted_regions(regions: Set[Region], writer.writeheader() regions = sorted(regions, key=lambda x: ( - x.organism.name, x.contig.name, x.start)) + x.organism.name, x.contig.name, x.starter)) for region in regions: row = { "region": region.name, "organism": region.organism, "contig": region.contig, - "start": region.start, - "stop": region.stop, - "genes": len(region.genes), + "start": region.starter, + "stop": region.stopper, + "genes": len(region), "contigBorder": region.is_contig_border, "wholeContig": region.is_whole_contig } @@ -402,7 +403,7 @@ def write_rgp_to_spot_table(rgp_to_spots: Dict[Region, Set[str]], output: Path, writer.writeheader() regions = sorted(rgp_to_spots.keys(), key=lambda x: ( - x.organism.name, x.contig.name, x.start)) + x.organism.name, x.contig.name, x.starter)) for region in regions: row = { "region": region.name, @@ -649,7 +650,8 @@ def predict_spots_in_input_organism(initial_spots: List[Spot], initial_regions: graph_spot.nodes[node]["includes_RGPs_from_the_input_organism"] = True for spot in spots_of_the_cc: - spot.add_regions(input_rgps_of_the_cc) + for region in input_rgps_of_the_cc: + spot.add(region) input_rgp_to_spots.update( {rgp: spots_of_the_cc for rgp in 
input_rgps_of_the_cc}) @@ -705,7 +707,7 @@ def project_and_write_modules(pangenome: Pangenome, input_organism: Organism, modules_in_input_org.append(mod) completion = round( - len(input_organism.families & mod.families) / len(mod.families), 2) + len(set(input_organism.families) & set(mod.families)) / len(set(mod.families)), 2) fout.write( f"module_{mod.ID}\t{input_organism.name}\t{completion}\n") @@ -774,7 +776,7 @@ def parser_projection(parser: argparse.ArgumentParser): optional.add_argument('--coverage', required=False, type=restricted_float, default=0.8, help="min coverage percentage threshold") - optional.add_argument("--translation_table", required=False, default="11", + optional.add_argument("--translation_table", required=False, default=11, type=int, help="Translation table (genetic code) to use.") optional.add_argument("--use_pseudo", required=False, action="store_true", diff --git a/ppanggolin/region.py b/ppanggolin/region.py index 29158290..5581e55c 100644 --- a/ppanggolin/region.py +++ b/ppanggolin/region.py @@ -4,11 +4,9 @@ # default libraries from __future__ import annotations import logging -from collections.abc import Iterable # installed libraries -from typing import Dict, Set - +from typing import Dict, Generator, List, Set import gmpy2 # local libraries @@ -19,122 +17,224 @@ class Region(MetaFeatures): """ - This class represent a region of genomic plasticity. - - :param region_id: identifier of the region + The 'Region' class represents a region of genomic plasticity. + Methods: + - 'genes': the property that generates the genes in the region as they are ordered in contigs. + - 'families': the property that generates the gene families in the region. + - 'Length': the property that gets the length of the region. + - 'organism': the property that gets the organism linked to the region. + - 'Contig': the property that gets the starter contig linked to the region. + - 'is_whole_contig': the property that indicates if the region is an entire contig. 
+ - 'is_contig_border': the property that indicates if the region is bordering a contig. + - 'get_rnas': the method that gets the RNA in the region. + - 'Get_bordering_genes': the method that gets the bordered genes in the region. + + Fields: + - 'name': the name of the region. + - 'score': the score of the region. + - 'Starter': the first gene in the region. + - 'stopper': the last gene in the region. """ + id_counter = 0 + + def __init__(self, name: str): + """Constructor method - def __init__(self, region_id: str): + :param name: Name of the region + """ super().__init__() - self.genes = [] - self.name = region_id + self._genes_getter = {} + self.name = name self.score = 0 + self.starter = None + self.stopper = None + self.ID = Region.id_counter + Region.id_counter += 1 def __str__(self): return self.name - def __hash__(self): + def __repr__(self) -> str: + """Region representation + """ + return f"RGP name:{self.name}" + + def __hash__(self) -> int: + """Create a hash value for the region + """ return id(self) + + def __lt__(self, obj): + return self.ID < obj.ID + + def __gt__(self, obj): + return self.ID > obj.ID def __eq__(self, other: Region) -> bool: """ - Expects another Region type object. 
Will test whether two Region objects have the same gene families + Test whether two Region objects have the same gene families + + :param other: Another region to test equality of regions - :param other: Other region to test equality of region + :return: Equal or not - :return: equal or not + :raises TypeError: Try to compare a region with another type object """ if not isinstance(other, Region): raise TypeError(f"'Region' type object was expected, but '{type(other)}' type object was provided.") if [gene.family for gene in self.genes] == [gene.family for gene in other.genes]: return True - if [gene.family for gene in self.genes] == [gene.family for gene in other.genes[::-1]]: + if [gene.family for gene in self.genes] == [gene.family for gene in list(other.genes)[::-1]]: return True return False - def __len__(self): - return len(self.genes) + def __len__(self) -> int: + """Get the number of genes in the region + """ + return len(self._genes_getter) + + def __setitem__(self, position: int, gene: Gene): + """Set a gene by is position in the region + + :param position: Position of the gene in the contig + :param gene: Gene to add in the region - def __getitem__(self, index): - return self.genes[index] + :raises TypeError: Gene is not instance Gene + :raises Exception: Organism or contig of the gene is different from the region + :raises KeyError: Another gene already exists at the position + """ + if len(self) > 0: + if gene.organism != self.organism: + raise Exception(f"Gene {gene.name} is from a different organism than the first defined in RGP. " + f"That's not possible") + if gene.contig != self.contig: + raise Exception(f"Gene {gene.name} is from a different contig than the first defined in RGP. 
" + f"That's not possible") + if position in self._genes_getter and self[position] != gene: + raise KeyError("Another gene already exist at this position") + self._genes_getter[position] = gene + self.starter = self._genes_getter[min(self._genes_getter.keys())] + self.stopper = self._genes_getter[max(self._genes_getter.keys())] + gene.RGP = self - def append(self, gene: Gene): - """allowing only gene-class objects in a region + def __getitem__(self, position: int) -> Gene: + """Get the gene at the given position - :param gene: gene which will be added + :param position: Position of the gene - :raise TypeError: If gene is not Gene type raise TypeError + :return: Gene in the Region at the given position + + :raises KeyError: Gene at the given position does not exist """ + try: + return self._genes_getter[position] + except KeyError: + raise KeyError(f"There is no gene at position {position} in RGP {self.name}") - if isinstance(gene, Gene): - self.genes.append(gene) - gene.RGP.add(self) - else: - raise TypeError(f"Unexpected class / type for {type(gene)} when adding it to a RGP") + def __delitem__(self, position): + """Remove the gene at the given position - @property - def families(self) -> Set[GeneFamily]: - """Get the gene families in the RGP + :param position: Position of the gene + + :raises KeyError: Gene at the given position does not exist""" + try: + del self._genes_getter[position] + except KeyError: + raise KeyError(f"There is no gene at position {position} in RGP {self.name}") + + def add(self, gene: Gene): + """Add a gene to the region + + :param gene: Gene to add + """ + if not isinstance(gene, Gene): + raise TypeError(f"Unexpected class / type for {type(gene)} " + f"when adding it to a region of genomic plasticity") + if gene.position is None: + raise AttributeError(f'Gene {gene.name} is not fill with position') + self[gene.position] = gene + + def get(self, position: int) -> Gene: + """Get a gene by its position + + :param position: Position of the 
gene in the contig + + :return: Wanted gene - :return: Set of gene families + :raises TypeError: Position is not an integer """ - return {gene.family for gene in self.genes} + if not isinstance(position, int): + raise TypeError(f"Position to get gene must be an integer. The provided type was {type(position)}") + return self[position] + + def remove(self, position): + """Remove a gene by its position + + :param position: Position of the gene in the contig + + :raises TypeError: Position is not an integer + """ + if not isinstance(position, int): + raise TypeError(f"Position to get gene must be an integer. The provided type was {type(position)}") + del self[position] @property - def start(self) -> int: - """ Get RGP starting position + def genes(self) -> Generator[Gene, None, None]: + """Generate the gene as they are ordered in contigs - :return: Start position + :return: Genes in the region """ - return min(self.genes, key=lambda x: x.start).start + for gene in sorted(self._genes_getter.values(), key=lambda x: x.position): + yield gene - @property # TODO try to change start with this method - def start_gene(self) -> Gene: - """ Get RGP starting gene + @property + def families(self) -> Generator[GeneFamily, None, None]: + """Get the gene families in the RGP - :return: Start gene + :return: Gene families """ - return min(self.genes, key=lambda x: x.position) + for gene in self.genes: + yield gene.family @property - def stop_gene(self) -> Gene: - """ Get RGP stoping position + def number_of_families(self) -> int: + """Get the number of different gene families in the region - :return: Stoping position + :return: Number of families """ - return max(self.genes, key=lambda x: x.position) + return len(set(self.families)) @property - def stop(self): - """ Get RGP stoping position + def length(self): + """Get the length of the region - :return: Stop position + :return: Size of the region """ - return max(self.genes, key=lambda x: x.stop).stop + return self.stopper.stop - 
self.starter.start @property def organism(self) -> Organism: """ Get the Organism link to RGP - :return: Organism + :return: Organism corresponding to the region """ - return self.genes[0].organism + return self.starter.organism @property def contig(self) -> Contig: - """ Get the Contig link to RGP + """ Get the starter contig link to RGP - :return: Contig + :return: Contig corresponding to the region """ - return self.genes[0].contig + return self.starter.contig @property def is_whole_contig(self) -> bool: """Indicates if the region is an entire contig - :return: True if whole contig + :return: True if whole contig else False """ - if self.start_gene.position == 0 and self.stop_gene.position == len(self.contig.genes) - 1: + if self.starter.position == 0 and self.stopper.position == self.contig.number_of_genes - 1: return True return False @@ -142,65 +242,59 @@ def is_whole_contig(self) -> bool: def is_contig_border(self) -> bool: """Indicates if the region is bordering a contig - :return: True if bordering - """ - if len(self.genes) == 0: - raise Exception("Your region has no genes. Something wrong happenned.") - if (self.start_gene.position == 0 and not self.contig.is_circular) or \ - (self.stop_gene.position == len(self.contig.genes) - 1 and not self.contig.is_circular): - return True - return False + :return: True if bordering else False - def get_rnas(self) -> set: - """ Get RNA in region - - :return: Set of RNA + :raises AssertionError: No genes in the regions, it's not expected """ - rnas = set() - for rna in self.contig.RNAs: - if self.start < rna.start < self.stop: - rnas.add(rna) - return rnas + assert len(self) > 0, "Your region has no genes. Something wrong happenned." 
+ + min_pos = min(self.contig.genes, key=lambda x: x.position).position + max_pos = max(self.contig.genes, key=lambda x: x.position).position + if not self.contig.is_circular: + if self.starter.position == min_pos or self.stopper.position == max_pos: + return True + return False - def get_bordering_genes(self, n: int, multigenics: set) -> list: + def get_bordering_genes(self, n: int, multigenics: set) -> List[List[Gene], List[Gene]]: """ Get the bordered genes in the region - :param n: number of genes to get + :param n: Number of genes to get :param multigenics: pangenome graph multigenic persistent families - :return: A list of bordering gene in start and stop position List[List[Start Gene], [Stop Gene]] + :return: A list of bordering genes in start and stop position """ + # TODO add Exception border = [[], []] - pos = self.start_gene.position + pos = self.starter.position init = pos while len(border[0]) < n and (pos != 0 or self.contig.is_circular): curr_gene = None - if pos == 0: + if pos == 0: # TODO change for variable to be more flexible if self.contig.is_circular: - curr_gene = self.contig.genes[-1] + curr_gene = self.contig[pos - 1] else: - curr_gene = self.contig.genes[pos - 1] + curr_gene = self.contig[pos - 1] if curr_gene is not None and curr_gene.family not in multigenics and \ curr_gene.family.named_partition == "persistent": border[0].append(curr_gene) pos -= 1 if pos == -1 and self.contig.is_circular: - pos = len(self.contig.genes) + pos = self.contig.number_of_genes if pos == init: break # looped around the contig - pos = self.stop_gene.position + pos = self.stopper.position init = pos - while len(border[1]) < n and (pos != len(self.contig.genes) - 1 or self.contig.is_circular): + while len(border[1]) < n and (pos != self.contig.number_of_genes - 1 or self.contig.is_circular): curr_gene = None - if pos == len(self.contig.genes) - 1: + if pos == self.contig.number_of_genes - 1: if self.contig.is_circular: - curr_gene = self.contig.genes[0] + 
curr_gene = self.contig[0] else: - curr_gene = self.contig.genes[pos + 1] + curr_gene = self.contig[pos + 1] if curr_gene is not None and curr_gene.family not in multigenics: border[1].append(curr_gene) pos += 1 - if pos == len(self.contig.genes) and self.contig.is_circular: + if pos == self.contig.number_of_genes and self.contig.is_circular: pos = -1 if pos == init: break # looped around the contig @@ -209,68 +303,166 @@ def get_bordering_genes(self, n: int, multigenics: set) -> list: class Spot(MetaFeatures): """ - This class represent a hotspot. - - :param spot_id: identifier of the spot + The 'Spot' class represents a region of genomic plasticity. + Methods: + - 'regions': the property that generates the regions in the spot. + - 'families': the property that generates the gene families in the spot. + - 'spot_2_families': add to Gene Families a link to spot. + - 'borders': Extracts all the borders of all RGPs belonging to the spot + - 'get_uniq_to_rgp': Get dictionnary with a representing RGP as key, and all identical RGPs as value + - 'get_uniq_ordered_set': Get an Iterable of all the unique syntenies in the spot + - 'get_uniq_content': Get an Iterable of all the unique rgp (in terms of gene family content) in the spot + - 'count_uniq_content': Get a counter of uniq RGP and number of identical RGP (in terms of gene family content) + - 'count_uniq_ordered_set': Get a counter of uniq RGP and number of identical RGP (in terms of synteny content) + + Fields: + - 'ID': Identifier of the spot """ - def __init__(self, spot_id): + def __init__(self, spot_id: int): + """Constructor method + + :param spot_id: Identifier of the spot + """ + if not isinstance(spot_id, int): + raise TypeError(f"Spot identifier must be an integer. 
Given type is {type(spot_id)}") super().__init__() self.ID = spot_id - self.regions = set() + self._region_getter = {} self._uniqOrderedSet = {} - self._compOrderedSet = False self._uniqContent = {} - self._compContent = False + + def __repr__(self) -> str: + """Spot representation + """ + return f"Spot {self.ID} - #RGP: {len(self)}" def __str__(self): - return f'spot_{str(self.ID)}' + """String representation of the spot + """ + return f"spot_{self.ID}" - @property - def families(self) -> set: - """Get the gene families in the RGP + def __setitem__(self, name: str, region: Region): + """Set the region belonging to the spot + + :param name: Name of the region + :param region: Region to add in the spot - :return: Set of gene families + :raises KeyError: Name of the region is already in the spot for a different region """ + if name in self._region_getter and self[name] != region: + raise KeyError("A Region with the same name already exist in spot") + self._region_getter[name] = region - union = set() - for region in self.regions: - union |= region.families - return union + def __getitem__(self, name) -> Region: + """Get the region with the given name + + :param name: Name of the wanted region - def add_regions(self, regions): + :return: Region in the spot for the given name + + :raises KeyError: Name does not exist in the spot + :raises TypeError: Name is not a string + """ + if not isinstance(name, str): + raise TypeError(f"Name of the region must be a string. 
The provided type was {type(name)}") + try: + return self._region_getter[name] + except KeyError: + raise KeyError(f"Region with {name} does not exist in spot") + + def __delitem__(self, name): + """Delete the region for the given name + + :param name: Name of the wanted region + + :raises KeyError: Name does not exist in the spot + :raises TypeError: Name is not a string """ - Adds region(s) contained in an Iterable to the spot which all have the same bordering persistent genes - provided with 'borders' + if not isinstance(name, str): + raise TypeError(f"Name of the region must be a string. The provided type was {type(name)}") + try: + del self._region_getter[name] + except KeyError: + raise KeyError(f"Region with {name} does not exist in spot") - :param regions: Iterable list of RGP to add to spot + def __len__(self) -> int: + """Get the number of regions in the spot """ - if isinstance(regions, Iterable): - for region in regions: - self.add_region(region) - else: - raise Exception("The provided 'regions' variable was not an Iterable") + return len(self._region_getter) + + def add(self, region: Region): + """Add a region to the spot. + Alias more readable for setitem + + :param region: Region to add in the spot + + :raises TypeError: Region is not an instance Region + """ + if not isinstance(region, Region): + raise TypeError(f"A Region object is expected to be added to the spot. find type is {type(region)}") + self[region.name] = region + + def get(self, name: str) -> Region: + """Get a region by its name. + Alias more readable for getitem + + :param name: Name of the region + + :return: Wanted region + """ + return self[name] + + def remove(self, name: str): + """Remove a region by its name. 
+ Alias more readable for delitem + + :param name: Name of the region + """ + del self[name] + + @property + def regions(self) -> Generator[Region, None, None]: + """Generates the regions in the spot + + :return: Regions in the spot + """ + for region in self._region_getter.values(): + yield region + + @property + def families(self) -> Generator[GeneFamily, None, None]: + """Get the gene families in the RGP - def add_region(self, region): + :return: Family in the spot """ - Add one RGP to the spot + families = set() + for region in self.regions: + for family in region.families: + if family not in families: + families.add(family) + yield family + + @property + def number_of_families(self) -> int: + """Get the number of different families in the spot - :param region: RGP to add to spot + :return: Number of families """ - if isinstance(region, Region): - self.regions.add(region) + return len({family for region in self.regions for family in region.families}) def spot_2_families(self): - """Add to Gene Families a link to spot""" + """Add to Gene Families a link to spot + """ for family in self.families: - family.spot.add(self) + family.add_spot(self) - def borders(self, set_size: int, multigenics): + def borders(self, set_size: int, multigenics) -> List[List[int, List[GeneFamily], List[GeneFamily]]]: """ Extracts all the borders of all RGPs belonging to the spot - :param set_size: number of genes to get + :param set_size: Number of genes to get :param multigenics: pangenome graph multigenic persistent families - :return: families that bordering spot + :return: Families that bordering spot """ all_borders = [] for rgp in self.regions: @@ -291,7 +483,8 @@ def borders(self, set_size: int, multigenics): return family_borders def _mk_uniq_ordered_set_obj(self): - """cluster RGP into groups that have an identical synteny""" + """cluster RGP into groups that have an identical synteny + """ for rgp in self.regions: z = True for seen_rgp in self._uniqOrderedSet: @@ -301,52 
+494,51 @@ def _mk_uniq_ordered_set_obj(self): if z: self._uniqOrderedSet[rgp] = {rgp} - def _mk_uniq_content(self): - """cluster RGP into groups that have identical gene content""" - for rgp in self.regions: - z = True - for seen_rgp in self._uniqContent: - if rgp.families == seen_rgp.families: - z = False - self._uniqContent[seen_rgp].add(rgp) - if z: - self._uniqContent[rgp] = {rgp} - - def _get_content(self): - """Creates the _uniqContent object if it was never computed. Return it in any case - - :return: RGP groups that have identical gene content - """ - if not self._compContent: - self._mk_uniq_content() - self._compContent = True - return self._uniqContent - - def _get_ordered_set(self): + def _get_ordered_set(self) -> Dict[Region, Set[Region]]: """ Creates the _uniqSyn object if it was never computed. Return it in any case :return: RGP groups that have an identical synteny """ - if not self._compOrderedSet: + if len(self._uniqOrderedSet) == 0: self._mk_uniq_ordered_set_obj() - self._compOrderedSet = True return self._uniqOrderedSet - def get_uniq_to_rgp(self) -> dict: - """ Get dictionnary with a representing RGP as key, and all identical RGPs as value + def get_uniq_to_rgp(self) -> Dict[Region, Set[Region]]: + """ Get dictionnary with a representing RGP as the key, and all identical RGPs as value - :return: Dictionnary with a representing RGP as key, and all identical RGPs as value + :return: Dictionnary with a representing RGP as the key, and set of identical RGPs as value """ return self._get_ordered_set() - def get_uniq_ordered_set(self): + def get_uniq_ordered_set(self) -> Set[Region]: """Get an Iterable of all the unique syntenies in the spot :return: Iterable of all the unique syntenies in the spot """ return set(self._get_ordered_set().keys()) - def get_uniq_content(self): + def _mk_uniq_content(self): + """cluster RGP into groups that have identical gene content + """ + for rgp in self.regions: + z = True + for seen_rgp in self._uniqContent: + if 
rgp.families == seen_rgp.families: + z = False + self._uniqContent[seen_rgp].add(rgp) + if z: + self._uniqContent[rgp] = {rgp} + + def _get_content(self) -> Dict[Region, Set[Region]]: + """Creates the _uniqContent object if it was never computed. + + :return: RGP groups that have identical gene content + """ + if len(self._uniqContent) == 0: + self._mk_uniq_content() + return self._uniqContent + + def get_uniq_content(self) -> Set[Region]: """ Get an Iterable of all the unique rgp (in terms of gene family content) in the spot :return: Iterable of all the unique rgp (in terms of gene family content) in the spot @@ -357,7 +549,7 @@ def count_uniq_content(self) -> dict: """ Get a counter of uniq RGP and number of identical RGP (in terms of gene family content) - :return: dictionary with a representative rgp as key and number of identical rgp as value + :return: Dictionary with a representative rgp as the key and number of identical rgp as value """ return dict([(key, len(val)) for key, val in self._get_content().items()]) @@ -365,52 +557,149 @@ def count_uniq_ordered_set(self): """ Get a counter of uniq RGP and number of identical RGP (in terms of synteny content) - :return: dictionary with a representative rgp as key and number of identical rgp as value + :return: Dictionary with a representative rgp as the key and number of identical rgp as value """ return dict([(key, len(val)) for key, val in self._get_ordered_set().items()]) class Module(MetaFeatures): - """ - This class represent a hotspot. + """The `Module` class represents a module in a pangenome analysis. + + The `Module` class has the following attributes: + - `ID`: An integer identifier for the module. + - `bitarray`: A bitarray representing the presence/absence of the gene families in an organism. 
- :param module_id: identifier of the module - :param families: Set of families which define the module + The `Module` class has the following methods: + - `families`: Returns a generator that yields the gene families in the module. + - `mk_bitarray`: Generates a bitarray representing the presence/absence of the gene families in an organism using the provided index. """ def __init__(self, module_id: int, families: set = None): + """Constructor method + + :param module_id: Module identifier + :param families: Set of families which define the module """ - 'core' are gene families that define the module. - 'associated_families' are gene families that you believe are associated to the module in some way, - but do not define it. - """ + if not isinstance(module_id, int): + raise TypeError(f"Module identifier must be an integer. Given type is {type(module_id)}") super().__init__() self.ID = module_id - self._families = set() - if families is not None: - if not all(isinstance(fam, GeneFamily) for fam in families): - raise Exception("You provided elements that were not GeneFamily object. 
" - "Modules are only made of GeneFamily") - self._families |= set(families) + self._families_getter = {family.name: family for family in families} if families is not None else {} self.bitarray = None - @property - def families(self) -> Set[GeneFamily]: - # TODO made as generator - return self._families + def __repr__(self) -> str: + """Module representation + """ + return f"Module {self.ID} - #Families: {len(self)}" - def __str__(self): - return f'module_{str(self.ID)}' + def __str__(self) -> str: + """String representation of the module + """ + return f"module_{self.ID}" + + def __hash__(self) -> int: + """Create a hash value for the module + """ + return id(self) + + def __len__(self) -> int: + """Get the number of families in the module + """ + return len(self._families_getter) + + def __eq__(self, other: Module) -> bool: + """ + Test whether two Module objects have the same gene families + + :param other: Another module to test equality + + :return: Equal or not + + :raises TypeError: Try to compare a module with another type object + """ + if not isinstance(other, Module): + raise TypeError(f"Another module is expected to be compared to the first one. 
You give a {type(other)}") + return set(self.families) == set(other.families) + + def __setitem__(self, name: str, family: GeneFamily): + """Set a gene family in the module + + :param name: Name of the family + :param family: Gene family belonging to the module + + :raises TypeError: Family is not instance GeneFamily + :raises KeyError: Another family with the same name already exists in the module + """ + if name in self._families_getter and self[name] != family: + raise KeyError("A different gene family with the same name already exist in the module") + self._families_getter[name] = family + family.add_module(self) + + def __getitem__(self, name) -> GeneFamily: + """Get the gene family for the given name in the module + + :param name: Name of the gene family + + :return: Gene family with the given name + + :raises KeyError: Family with the given name does not exist in the module + """ + try: + return self._families_getter[name] + except KeyError: + raise KeyError(f"There isn't gene family with the name {name} in the module") + + def __delitem__(self, name): + """Remove the gene family for the given name in the module + + :param name: Name of the gene family - def add_family(self, family: GeneFamily): + :raises KeyError: Family with the given name does not exist in the module """ - Add a family to the module + try: + fam = self._families_getter[name] + except KeyError: + raise KeyError(f"There isn't gene family with the name {name} in the module") + else: + del self._families_getter[name] + fam._modules.remove(self) + + def add(self, family: GeneFamily): + """Add a family to the module. + Alias more readable for setitem + + :param family: Region to add in the spot - :param family: the family that will ba added to the module + :raises TypeError: Region is not an instance Region """ if not isinstance(family, GeneFamily): - raise Exception("You did not provide a GenFamily object. 
Modules are only made of GeneFamily") - family.modules.add(self) - self._families.add(family) + raise TypeError(f"A gene family is expected to be added to module. Given type was {type(family)}") + self[family.name] = family + + def get(self, name: str) -> GeneFamily: + """Get a family by its name. + Alias more readable for getitem + + :param name: Name of the family + + :return: Wanted family + """ + return self[name] + + def remove(self, name: str): + """Remove a family by its name. + Alias more readable for delitem + + :param name: Name of the family + """ + del self[name] + + @property + def families(self) -> Generator[GeneFamily, None, None]: + """Generator of the family in the module + + :return: Families belonging to the module + """ + yield from self._families_getter.values() def mk_bitarray(self, index: Dict[Organism, int], partition: str = 'all'): """Produces a bitarray representing the presence / absence of families in the organism using the provided index @@ -445,29 +734,103 @@ def mk_bitarray(self, index: Dict[Organism, int], partition: str = 'all'): class GeneContext: """ - A class used to represent a gene context + The GeneContext class represents a gene context, which is a collection of gene families related to a specific genomic context. - :param gc_id : identifier of the Gene context - :param families: Gene families related to the GeneContext - """ + Methods + families: Generator that yields all the gene families in the gene context. + Fields + ID: The identifier of the gene context. + """ def __init__(self, gc_id: int, families: set = None): + """Constructor method + :param gc_id : Identifier of the Gene context + :param families: Gene families related to the GeneContext + """ + if not isinstance(gc_id, int): + raise TypeError(f"Gene context identifier must be an integer. 
Given type is {type(gc_id)}") self.ID = gc_id - self.families = set() - if families is not None: - if not all(isinstance(fam, GeneFamily) for fam in families): - raise Exception("You provided elements that were not GeneFamily object." - " GeneContext are only made of GeneFamily") - self.families |= set(families) + self._families_getter = {family.name: family for family in families} if families is not None else {} - def __str__(self): + def __repr__(self) -> str: + """Context representation + """ + return f"Context {self.ID} - #Families: {len(self)}" + + def __str__(self) -> str: + """String representation of the gene context + """ return f'GC_{str(self.ID)}' - def add_family(self, family: GeneFamily): + def __hash__(self) -> int: + """Create a hash value for the region + """ + return id(self) + + def __len__(self) -> int: + """Get the number of families in the context + """ + return len(self._families_getter) + + def __eq__(self, other: GeneContext) -> bool: + """ + Test whether two gene context objects have the same gene families + + :param other: Another gene context to test equality + + :return: Equal or not + + :raises TypeError: Try to compare a gene context with another type object """ - Allow to add one family in the GeneContext - :param family: family to add + if not isinstance(other, GeneContext): + raise TypeError(f"Another context is expected to be compared to the first one. You give a {type(other)}") + return set(self.families) == set(other.families) + + def __setitem__(self, name, family): + """Set a gene family in the gene context + + :param name: Name of the family + :param family: Gene family belonging to the context + + :raises TypeError: Family is not instance GeneFamily + :raises KeyError: Another family with the same name already exists in the context """ if not isinstance(family, GeneFamily): - raise Exception("You did not provide a GenFamily object. 
Modules are only made of GeneFamily") - self.families.add(family) + raise TypeError(f"A gene family is expected to be added to gene context. Given type was {type(family)}") + if name in self._families_getter and self[name] != family: + raise KeyError("A different gene family with the same name already exist in the gene context") + self._families_getter[name] = family + + def __getitem__(self, name) -> GeneFamily: + """Get the gene family for the given name in the context + + :param name: Name of the gene family + + :return: Gene family with the given name + + :raises KeyError: Family with the given name does not exist in the context + """ + try: + return self._families_getter[name] + except KeyError: + raise KeyError(f"There isn't gene family with the name {name} in the gene context") + + def __delitem__(self, name): + """Remove the gene family for the given name in the context + + :param name: Name of the gene family + + :raises KeyError: Family with the given name does not exist in the context + """ + try: + del self._families_getter[name] + except KeyError: + raise KeyError(f"There isn't gene family with the name {name} in the gene context") + + @property + def families(self) -> Generator[GeneFamily, None, None]: + """Generator of the family in the context + + :return: Gene families belonging to the context + """ + yield from self._families_getter.values() diff --git a/ppanggolin/utility/utils.py b/ppanggolin/utility/utils.py index 41cce5e2..accea93a 100644 --- a/ppanggolin/utility/utils.py +++ b/ppanggolin/utility/utils.py @@ -132,7 +132,7 @@ def launch_default_config(args: argparse.Namespace): """ initial_command = args.default_config - if os.path.exists(args.output) and not args.force: + if args.output.exists() and not args.force: raise FileExistsError(f"{args.output} already exists. 
Use -f if you want to overwrite it.") ignored_params = ['config', 'help'] diff --git a/ppanggolin/workflow/all.py b/ppanggolin/workflow/all.py index 6fa51524..4946abdf 100644 --- a/ppanggolin/workflow/all.py +++ b/ppanggolin/workflow/all.py @@ -12,7 +12,7 @@ # local libraries from ppanggolin.pangenome import Pangenome from ppanggolin.utils import mk_file_name, mk_outdir, check_option_workflow, restricted_float -from ppanggolin.annotate.annotate import annotate_pangenome, read_annotations, get_gene_sequences_from_fastas +from ppanggolin.annotate.annotate import annotate_pangenome, read_annotations, get_gene_sequences_from_fastas, check_annotate_args from ppanggolin.cluster.cluster import clustering, read_clustering from ppanggolin.graph.makeGraph import compute_neighbors_graph from ppanggolin.nem.rarefaction import make_rarefaction_curve @@ -41,6 +41,7 @@ def launch_workflow(args: argparse.Namespace, panrgp: bool = True, """ check_option_workflow(args) + check_annotate_args(args) pangenome = Pangenome() filename = mk_file_name(args.basename, args.output, args.force) @@ -87,7 +88,7 @@ def launch_workflow(args: argparse.Namespace, panrgp: bool = True, procedure=args.annotate.prodigal_procedure, translation_table=args.annotate.translation_table, kingdom=args.annotate.kingdom, norna=args.annotate.norna, - overlap=args.annotate.allow_overlap) + allow_overlap=args.annotate.allow_overlap) anno_time = time.time() - start_anno start_writing = time.time() @@ -185,8 +186,8 @@ def launch_workflow(args: argparse.Namespace, panrgp: bool = True, spot_time += time.time() - start_spot_drawing if args.draw.tile_plot: - if 1 < len(pangenome.organisms) < 5000: - nocloud = args.draw.nocloud if len(pangenome.organisms) < 500 else True + if 1 < pangenome.number_of_organisms < 5000: + nocloud = args.draw.nocloud if pangenome.number_of_organisms < 500 else True draw_tile_plot(pangenome, args.output, nocloud=nocloud, disable_bar=args.disable_prog_bar) else: 
logging.getLogger("PPanGGOLiN").warning( diff --git a/requirements.txt b/requirements.txt index 61774df7..2b72f9a1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,7 @@ prodigal>=2.6.3 aragorn>=1.2.41 infernal>=1.1.4 mmseqs2>=13.45111 -networkx>=2.3 +networkx>=2.7 dataclasses>=0.8 scipy>=1.7.3 plotly>=4.14.3 diff --git a/tests/genome/test_Contig.py b/tests/genome/test_Contig.py deleted file mode 100644 index 6d7c79aa..00000000 --- a/tests/genome/test_Contig.py +++ /dev/null @@ -1,79 +0,0 @@ -#! /usr/bin/env python3 -import random - -import pytest - -from ppanggolin.genome import Contig, Gene, RNA - - -@pytest.fixture() -def o_ctg(): - return Contig("toto") - - -def test_cstr(): - name = 4 - o_ctg = Contig(name) - assert isinstance(o_ctg, Contig) - for attr in "name", "is_circular", "RNAs": - assert hasattr(o_ctg, attr) - assert o_ctg.name == name - assert o_ctg.is_circular is False - assert o_ctg.RNAs == set() - - o_ctg = Contig(name, True) - assert o_ctg.is_circular is True - - -def test_str(): - name = "ppoiu" - o_ctg = Contig(name) - assert str(o_ctg) == name - - -def test_add_rna(o_ctg): - with pytest.raises(TypeError): - o_ctg.add_rna(33) - - l_rnas = [] - for i in "abdc": - o_rna = RNA(i) - o_ctg.add_rna(o_rna) - l_rnas.append(o_rna) - assert o_ctg.RNAs == set(l_rnas) - - -@pytest.fixture() -def l_genes(): - l_genes = [] - for i in range(6, -1, -1): # Create 7 Gene - o_gene = Gene(i) - o_gene.fill_annotations(start=i*10, stop=i*10 - 1, strand='+', position=i) - l_genes.append(o_gene) - - return l_genes - - -def test_add_gene(o_ctg, l_genes): - with pytest.raises(TypeError): - o_ctg.add_gene(33) - - # gene must have a position before beeing added. - with pytest.raises(TypeError): - o_ctg.add_gene(Gene(33)) - - for o_gene in l_genes: - o_ctg.add_gene(o_gene) - - assert o_ctg.genes == sorted(l_genes, key=lambda x: x.position) - - -def test_iterator_behavior(o_ctg, l_genes): - # FIXME: is there a better way to check this ? 
- assert iter(o_ctg) - - for o_gene in l_genes: - o_ctg.add_gene(o_gene) - - l_ = [o_gene for o_gene in o_ctg] - assert l_ == sorted(l_genes, key=lambda x: x.start) diff --git a/tests/genome/test_Feature.py b/tests/genome/test_Feature.py deleted file mode 100644 index 37bb93ef..00000000 --- a/tests/genome/test_Feature.py +++ /dev/null @@ -1,70 +0,0 @@ -#! /usr/bin/env python3 - -import pytest - -from ppanggolin.genome import Feature - - -def test_cstr(): - identifier = 4 - o_feature = Feature(identifier) - assert isinstance(o_feature, Feature) - for attr in "ID", "is_fragment", "type": - assert hasattr(o_feature, attr) - assert o_feature.ID == identifier - assert o_feature.is_fragment is False - assert o_feature.type == "" - - -@pytest.fixture() -def o_feature(): - return Feature(4) - - -def test_fill_annotations(o_feature): - start, stop = 1, 9 - strand = "plus" - o_feature.fill_annotations(start, stop, strand) - for attr in 'start', 'stop', 'strand', \ - 'type', 'product', 'name': - assert hasattr(o_feature, attr) - assert o_feature.start == start - assert o_feature.stop == stop - assert o_feature.strand == strand - assert o_feature.type == '' - assert o_feature.name == '' - assert o_feature.product == '' - - gene_type = "inconnu" - name = "Eugène" - product = "va savoir" - o_feature.fill_annotations(start, stop, strand, gene_type, name, product) - assert o_feature.type == gene_type - assert o_feature.name == name - assert o_feature.product == product - - # what if start or stop < 0 ? - # stop < start - # start/stop cannot int() ? 
- # position not int - - -def test_fill_parents(o_feature): - org = "toto" - ctg = 99 - o_feature.fill_parents(org, ctg) - for attr in 'organism', 'contig': - assert hasattr(o_feature, attr) - assert o_feature.organism == org - assert o_feature.contig == ctg - - -def test_add_dna(o_feature): - dna = "test adn" - o_feature.add_dna(dna) - assert hasattr(o_feature, 'dna') - o_feature.dna = dna - - dna = 123 - with pytest.raises(TypeError): - o_feature.add_dna(dna) diff --git a/tests/genome/test_Gene.py b/tests/genome/test_Gene.py deleted file mode 100644 index 45e02713..00000000 --- a/tests/genome/test_Gene.py +++ /dev/null @@ -1,57 +0,0 @@ -#! /usr/bin/env python3 - -import pytest - -from ppanggolin.genome import Feature, Gene - - -def test_cstr(): - """ By checking o_gene is a Feature, I rely on Feature tests.""" - identifier = 4 - o_gene = Gene(identifier) - assert isinstance(o_gene, Feature) - assert isinstance(o_gene, Gene) - - for attr in "position", "family": - assert hasattr(o_gene, attr) - assert o_gene.position is None - assert o_gene.family is None - - -def test_str(): - identifier = "un truc" - o_gene = Gene(identifier) - assert str(o_gene) == identifier - - -@pytest.fixture() -def o_gene(): - return Gene(4) - - -def test_fill_annotations_defaults(o_gene): - o_gene.fill_annotations(start=1, stop=9, strand='+') - for attr in "position", "genetic_code": - assert hasattr(o_gene, attr) - - assert o_gene.position is None - assert o_gene.genetic_code == 11 - - -def test_fill_annotations(o_gene): - position = 44 - genetic_code = 11 - o_gene.fill_annotations(start=1, stop=9, strand='+', position=44, genetic_code=11) - assert o_gene.position == position - assert o_gene.genetic_code == genetic_code - - -def test_add_protein_error(o_gene): - with pytest.raises(TypeError): - o_gene.add_protein(42) - - -def test_add_protein(o_gene): - prot = "une jolie protéïne, même avec des caractères bizarres ;)" - o_gene.add_protein(prot) - assert o_gene.protein == prot diff --git 
a/tests/genome/test_Organism.py b/tests/genome/test_Organism.py deleted file mode 100644 index 18c7a2cf..00000000 --- a/tests/genome/test_Organism.py +++ /dev/null @@ -1,88 +0,0 @@ -#! /usr/bin/env python3 - -import pytest -from random import randint - -from ppanggolin.genome import Contig, Gene, Organism - - -def test_cstr(): - name = 4 - o_org = Organism(name) - assert isinstance(o_org, Organism) - assert hasattr(o_org, "name") - assert o_org.name == name - - -def test_str(): - name = "ppoiu" - o_org = Organism(name) - assert str(o_org) == name - - -@pytest.fixture() -def o_org(): - return Organism("toto") - - -def test_get_or_add_contig(o_org): - o_ctg = o_org.get_contig('i') - assert isinstance(o_ctg, Contig) - - -@pytest.fixture() -def t_filled_org(o_org): - n = 0 - for k in "azerty'": - o_ctg = o_org.get_contig(k) - for i in range(randint(0, 5)): - o_gene = Gene(k + "-" + str(i)) - o_gene.fill_annotations(6, 1, k, position=i) - o_ctg.add_gene(o_gene) - n += 1 - - return o_org, n - - -def test_families(t_filled_org): - o_filled_org, _ = t_filled_org - - # families are never set - assert o_filled_org.families == {None} - - -def test_number_of_genes(t_filled_org): - o_filled_org, n = t_filled_org - - assert o_filled_org.number_of_genes() == n - - -def get_genes(): - for i in range(randint(0, 5)): - o_gene = Gene(str(i)) - start = randint(0, 100) - stop = randint(0, 100) - o_gene.fill_annotations(start, stop, 'x', position=i) - yield o_gene - - -def test_contigs(o_org): - l_contigs = [] - for k in "azer'": - o_ctg = o_org.get_contig(k) - for o_gene in get_genes(): - o_ctg.add_gene(o_gene) - l_contigs.append(o_ctg) - - assert list(o_org.contigs) == l_contigs - - -def test_genes(o_org): - o_ctg = o_org.get_contig("scrap") - for o_gene in get_genes(): - o_ctg.add_gene(o_gene) - - assert list(o_org.genes) == o_ctg.genes - - # FIXME: find a way to test when several contigs. - # => order of contig is not predictable. 
diff --git a/tests/region/test_Region.py b/tests/region/test_Region.py deleted file mode 100644 index 3fca27ae..00000000 --- a/tests/region/test_Region.py +++ /dev/null @@ -1,203 +0,0 @@ -#! /usr/bin/env python3 - -import pytest - -from ppanggolin.region import Region -from ppanggolin.geneFamily import GeneFamily -from ppanggolin.genome import Gene, Contig, Organism, RNA - - -# ================================================ -def test_cstr(): - identifier = 4 - o_region = Region(identifier) - assert isinstance(o_region, Region) - for attr in "genes", "name", "score": - assert hasattr(o_region, attr) - - assert o_region.score == 0 - assert o_region.name == identifier - assert o_region.genes == [] - - -# ================================================ -@pytest.fixture -def o_region(): - return Region(4) - - -@pytest.fixture -def o_org(): - return Organism("toto") - - -@pytest.fixture -def o_contig(): - return Contig(1) - - -@pytest.fixture -def o_rna(o_contig): - o_rna = RNA("Ah") - o_rna.fill_annotations(35, 45, "-") - o_contig.add_rna(o_rna) - return o_rna - - -@pytest.fixture -def l_genes(o_org, o_contig): - """ creates a small gene set for testing. 
- - returns a list of 4 genes that belongs - to the same contig and the same organism.""" - l_genes = [] - c = 10 - for i, gene_id in enumerate([ - "toto", "tata", "titi", "tutu", - "lolo", "lala", "lili", "lulu", - ]): - gene = Gene(gene_id) - gene.fill_annotations(c, c + 30, "+", position=i) - gene.fill_parents(o_org, o_contig) - o_contig.add_gene(gene) - gene.family = GeneFamily(i, gene_id) - gene.family.add_partition("c-cloud") - l_genes.append(gene) - c += 35 - return l_genes - - -# ================================================ -def test_append(l_genes, o_region): - for gene in l_genes: - o_region.append(gene) - - assert set(o_region.genes) == set(l_genes) - - -def test_append__error(o_region): - """append should raise a TypeError is used with non Gene param.""" - with pytest.raises(TypeError): - o_region.append(42) - - -def test_properties(l_genes, o_region, o_org, o_contig): - """All properties expect a region with genes.""" - s_families = set() - for gene in l_genes: - o_region.append(gene) - s_families.add(gene.family) - - # checking properties sanity - assert o_region.start == o_region.start_gene.start - assert o_region.stop == o_region.stop_gene.stop - assert o_region.organism == o_org - assert o_region.families == s_families - assert o_region.contig == o_contig - assert o_region.is_whole_contig is True - assert o_region.is_contig_border is True # first contig gene is in the region - - # remove the first gene of the contig - o_region.genes.pop(0) - assert o_region.is_contig_border is True # last contig gene is in the region - - # remove the last gene of the contig - # => the whole contig is not in the Region anymore - o_region.genes.pop() - assert o_region.is_whole_contig is False - assert o_region.is_contig_border is False - - -def test_is_contig_border(o_region): - """is_contig_border raise an exception - when the region contain no genes. 
- """ - with pytest.raises(Exception): - o_region.is_contig_border - - -def test_get_rnas(o_rna, o_region, l_genes): - for gene in l_genes: - o_region.append(gene) - assert set(o_region.get_rnas()) == {o_rna} - - -def test_hash(o_region): - """ a hash function returns an integer""" - # the same int if called twice on the same object - h = hash(o_region) - assert isinstance(h, int) - assert h == hash(o_region) - - # different ints if called on objects representing the same entity - name = "charming" - assert hash(Region(name)) != hash(Region(name)) - - -def test_equality(o_region, l_genes): - """2 regions are equals if they contain the same list of genes.""" - for gene in l_genes: - o_region.append(gene) - - # not the same list => False - o_other = Region("other") - assert o_region != o_other - - # the exact same list => True - o_other = Region("other") - for gene in l_genes: - o_other.append(gene) - assert o_region == o_other - - # the same list in reverse order => True - o_other = Region("other") - for gene in reversed(l_genes): - o_other.append(gene) - assert o_region == o_other - - -def test_equality__error(o_region): - """equality raises error if not compared to another Region""" - with pytest.raises(TypeError): - o_region == 42 - - -def test_len(o_region, l_genes): - assert 0 == len(o_region) - - for gene in l_genes: - o_region.append(gene) - assert len(l_genes) == len(o_region) - - -def test_get_item(o_region, l_genes): - with pytest.raises(IndexError): - o_region[1] - - for gene in l_genes: - o_region.append(gene) - assert o_region[2] == l_genes[2] - - -def test_get_bordering_genes(o_region, l_genes): - # return at most n-1 genes not in multigenics families - # nor in family with persistent partition. - - print("\n") - for gene in l_genes: - o_region.append(gene) - - l_first, l_last = o_region.get_bordering_genes(0, ['f1', 'f2']) - assert [] == l_first - assert [] == l_last - - # line 101 & 125 != while condition. => unreachable lines. 
- # return nothing if is_contig_border - l_first, l_last = o_region.get_bordering_genes(2, ['f1', 'f2']) - assert [] == l_first - assert [] == l_last - - # remove first and last gene - o_region.genes.pop(0) - o_region.genes.pop() - o_region.get_bordering_genes(4, ['f1', 'f2']) diff --git a/tests/region/test_rgp_cluster.py b/tests/region/test_rgp_cluster.py new file mode 100644 index 00000000..d6913d30 --- /dev/null +++ b/tests/region/test_rgp_cluster.py @@ -0,0 +1,225 @@ +#! /usr/bin/env python3 +# coding: utf8 + +import pytest +from random import randint +from typing import Generator, Set +from ppanggolin.RGP import rgp_cluster +from ppanggolin.genome import Gene, Contig, Organism +from ppanggolin.geneFamily import GeneFamily +from ppanggolin.region import Region +from ppanggolin.RGP.rgp_cluster import IdenticalRegions + + +@pytest.fixture +def genes() -> Generator[Set[Gene], None, None]: + """Create a set of genes to fill gene families + """ + organism = Organism("organism") + contig = Contig("contig") + genes = set() + for i in range(0, randint(11, 20)): + gene = Gene(f"gene_{str(i)}") + gene.fill_annotations(start=10 * i + 1, stop=10 * (i + 1), strand='+', position=i, genetic_code=4) + gene.fill_parents(organism, contig) + contig[gene.start] = gene + genes.add(gene) + yield genes + + +@pytest.fixture +def families(genes) -> Generator[Set[GeneFamily], None, None]: + """Create a set of gene families fill with genes to test edges + """ + families = set() + genes = list(genes) + nb_families = randint(9, 20) + nb_genes_per_family = len(genes) // nb_families + idx_fam = 1 + while idx_fam < nb_families: + family = GeneFamily(idx_fam, f"family_{idx_fam}") + idx_genes = 0 + while idx_genes < nb_genes_per_family: + gene = genes[(idx_fam - 1) * nb_genes_per_family + idx_genes] + family.add(gene) + gene.family = family + idx_genes += 1 + families.add(family) + idx_fam += 1 + # last family fill with all the gene left + family = GeneFamily(idx_fam, f"family_{idx_fam}") + 
idx_genes = (idx_fam - 1) * nb_genes_per_family + while idx_genes < len(genes): + gene = genes[idx_genes] + family.add(gene) + gene.family = family + idx_genes += 1 + families.add(family) + yield families + + +@pytest.fixture +def identical_rgps(genes, families) -> Generator[Set[Region], None, None]: + """Create a set of identical rgps + """ + identical_rgps = set() + for i in range(1, randint(6, 21)): + rgp = Region(f"RGP_{i}") + # the three rgp have the gene content. + # in terms of family they are identical + for gene in genes: + rgp[gene.position] = gene + identical_rgps.add(rgp) + yield identical_rgps + + +class TestIdenticalRegions: + def test_init_with_valid_inputs(self, identical_rgps, families): + """Tests that the IdenticalRegions object is initialized correctly with valid inputs. + """ + is_contig_border = True + identical_regions = IdenticalRegions("IdenticalRegions", identical_rgps, families, is_contig_border) + + assert identical_regions.name == "IdenticalRegions" + assert identical_regions.rgps == identical_rgps + assert identical_regions.families == families + assert identical_regions.is_contig_border == is_contig_border + + @pytest.mark.parametrize("wrong_type", + ["string", + 1, + 0.8, + list(), + dict()]) + def test_init_with_identical_rgps_not_isintance_set(self, wrong_type, families): + """Tests that the IdenticalRegions object cannot be initialized with a not instance set for identical_rgps. + """ + with pytest.raises(TypeError): + IdenticalRegions("IdenticalRegions", wrong_type, families, True) + + def test_init_with_rgp_is_not_instance_region_in_identical_rgps(self, identical_rgps, families): + """Tests that the IdenticalRegions object raise TypeError if one element is not instance Region. 
+ """ + with pytest.raises(TypeError): + IdenticalRegions("IdenticalRegions", identical_rgps.union({1}), families, True) + + def test_init_with_empty_identical_rgps(self, families): + """Tests that the IdenticalRegions object cannot be initialized with an empty set of identical regions. + """ + with pytest.raises(ValueError): + IdenticalRegions("IdenticalRegions", set(), families, True) + + @pytest.mark.parametrize("wrong_type", + ["string", + 1, + 0.8, + list(), + dict()]) + def test_init_with_families_not_isintance_set(self, wrong_type, identical_rgps): + """Tests that the IdenticalRegions object cannot be initialized with a not instance set. + """ + with pytest.raises(TypeError): + IdenticalRegions("IdenticalRegions", identical_rgps, wrong_type, True) + + def test_init_with_family_is_not_instance_genefamilies_in_families(self, identical_rgps, families): + """Tests that the IdenticalRegions object raise TypeError if one element is not instance Region. + """ + with pytest.raises(TypeError): + IdenticalRegions("IdenticalRegions", identical_rgps, families.union({1}), True) + + def test_init_with_empty_families(self, identical_rgps): + """Tests that the IdenticalRegions object cannot be initialized with an empty set of identical regions. + """ + with pytest.raises(ValueError): + IdenticalRegions("IdenticalRegions", identical_rgps, set(), True) + + def test_eq_with_equal_identical_regions(self): + """Tests that the __eq__ method returns Trueè when comparing two IdenticalRegions objects that have the same families, + identical regions, and contig border status. 
+ """ + rgp1 = Region("RGP1") + rgp2 = Region("RGP2") + family1 = GeneFamily(1, "Family1") + family2 = GeneFamily(2, "Family2") + identical_rgps1 = {rgp1, rgp2} + identical_rgps2 = {rgp1, rgp2} + families1 = {family1, family2} + families2 = {family1, family2} + is_contig_border = True + + identical_regions1 = IdenticalRegions("IdenticalRegions", identical_rgps1, families1, is_contig_border) + identical_regions2 = IdenticalRegions("IdenticalRegions", identical_rgps2, families2, is_contig_border) + + assert identical_regions1 == identical_regions2 + + def test_eq_with_non_identical_regions(self): + """Tests that the __eq__ method returns False when comparing + two IdenticalRegions objects that have different families. + """ + rgp1 = Region("RGP1") + rgp2 = Region("RGP2") + family1 = GeneFamily(1, "Family1") + family2 = GeneFamily(2, "Family2") + identical_rgps1 = {rgp1, rgp2} + identical_rgps2 = {rgp1, rgp2} + families1 = {family1, family2} + families2 = {family1} + is_contig_border = True + + identical_regions1 = IdenticalRegions("IdenticalRegions", identical_rgps1, families1, is_contig_border) + identical_regions2 = IdenticalRegions("IdenticalRegions", identical_rgps2, families2, is_contig_border) + + assert identical_regions1 != identical_regions2 + + +def test_compute_grr(): + """Tests that compute_grr returns the correct value when there is a non-zero intersection between families + """ + set1 = {1, 2, 3, 4, 5} + set2 = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10} + + assert rgp_cluster.compute_grr(set1, set2, min) == 1.0 + assert rgp_cluster.compute_grr(set1, set2, max) == 0.5 + + +def test_dereplicate_rgp(identical_rgps): + list_identical_rgps = list(identical_rgps) + rgp1 = list_identical_rgps[0] + assert rgp_cluster.dereplicate_rgp({rgp1}) == [rgp1] + + identical_region_obj = rgp_cluster.IdenticalRegions(name="identical_rgps_0", + identical_rgps=identical_rgps, + families=set(list_identical_rgps[0].families), + is_contig_border=True) + assert 
rgp_cluster.dereplicate_rgp(rgps=identical_rgps)[0] == identical_region_obj + + +def test_compute_rgp_metric(genes, families): + RGP_a = Region("A") + RGP_b = Region("B") + list_genes = sorted(genes, key=lambda x: x.position) + + for g in list_genes[:8]: + RGP_a[g.position] = g + for g in list_genes[3:7]: + RGP_b[g.position] = g + + assert RGP_a.is_contig_border + assert not RGP_b.is_contig_border + + shared_families = len(set(RGP_a.families).intersection(set(RGP_b.families))) + expected_grr = (RGP_a.ID, RGP_b.ID, {'incomplete_aware_grr': shared_families / min(len(set(RGP_a.families)), len(set(RGP_b.families))), + "min_grr": shared_families / min(len(set(RGP_a.families)), len(set(RGP_b.families))), + 'max_grr': shared_families / max(len(set(RGP_a.families)), len(set(RGP_b.families))), + 'shared_family': shared_families}) + # min_grr + min_result = rgp_cluster.compute_rgp_metric(RGP_a, RGP_b, 0, "min_grr") + assert min_result == expected_grr + + # incomplete_aware_grr: same as min grr as rgp1 is incomplete + incomplete_aware_result = rgp_cluster.compute_rgp_metric(RGP_a, RGP_b, 0, "incomplete_aware_grr") + + assert incomplete_aware_result == expected_grr + + # max grr is below cutoff so None is returned + assert rgp_cluster.compute_rgp_metric(RGP_a, RGP_b, 1000, "max_grr") is None diff --git a/tests/test_Edge.py b/tests/test_Edge.py deleted file mode 100644 index 6a5bf715..00000000 --- a/tests/test_Edge.py +++ /dev/null @@ -1,134 +0,0 @@ -#! 
/usr/bin/env python3 - -import pytest - -from ppanggolin.genome import Gene -from ppanggolin.edge import Edge -from ppanggolin.geneFamily import GeneFamily - - -def test_cstr_error(): - o_src = Gene('source') - o_tgt = Gene('target') - # genes should have a family - with pytest.raises(Exception): - _ = Edge(o_src, o_tgt) - - o_family = GeneFamily(None, None) - o_family.add_gene(o_src) - # both genes sould have a family - with pytest.raises(Exception): - _ = Edge(o_src, o_tgt) - - # gene should belong to the same organism - o_family.add_gene(o_tgt) - o_src.fill_parents("", None) - o_tgt.fill_parents(None, None) - with pytest.raises(Exception): - _ = Edge(o_src, o_tgt) - - -def test_cstr(): - o_src = Gene('source') - o_tgt = Gene('target') - - # set organism and contig to None. - o_src.fill_parents(None, None) - o_tgt.fill_parents(None, None) - - # define the None GeneFamily, and add the 2 genes to it. - o_family = GeneFamily(None, None) - o_family.add_gene(o_src) - o_family.add_gene(o_tgt) - - o_edge = Edge(o_src, o_tgt) - assert isinstance(o_edge, Edge) - - assert o_edge.source == o_src.family - assert o_edge.target == o_tgt.family - assert dict(o_edge.organisms) == {None: [(o_src, o_tgt)]} - - -@pytest.fixture() -def make_gene_pair(): - def _make_gene_pair(org, gene_id1, gene_id2): - """create 2 genes from org. 
- each gene belong to its own family.""" - lo_genes = [] - for k in gene_id1, gene_id2: - o_gene = Gene(k) - o_gene.fill_parents(org, None) - - lo_genes.append(o_gene) - - o_family = GeneFamily(k, k) - o_family.add_gene(o_gene) - - return tuple(lo_genes) - - return _make_gene_pair - - -@pytest.fixture() -def o_edge(make_gene_pair): - p = make_gene_pair("org", "src", "tgt") - return Edge(*p) - - -def test_add_enes(make_gene_pair): - p1 = make_gene_pair("org1", "s1", "t1") - p2 = make_gene_pair("org1", "s2", "t1") - p3 = make_gene_pair("org2", "s1", "t2") - p4 = make_gene_pair("org2", "s1", "s2") - # org1: s1,s2 -- t1 - # org2: s1 -- t2,s2 - - o_edge = Edge(*p1) - o_edge.add_genes(*p2) - o_edge.add_genes(*p3) - o_edge.add_genes(*p4) - assert set(o_edge.organisms.keys()) == {"org1", "org2"} - assert o_edge.organisms["org1"] == [p1, p2] - assert o_edge.organisms["org2"] == [p3, p4] - - -@pytest.fixture() -def filled_edge(make_gene_pair): - # Note that the same edge here links 4 families. - p1 = make_gene_pair("org1", "s1", "t1") - p2 = make_gene_pair("org1", "s2", "t1") - p3 = make_gene_pair("org2", "s1", "t2") - p4 = make_gene_pair("org2", "s1", "s2") - # org1: s1,s2 -- t1 - # org2: s1 -- t2,s2 - - o_edge = Edge(*p1) - o_edge.add_genes(*p2) - o_edge.add_genes(*p3) - o_edge.add_genes(*p4) - - return o_edge - - -def test_get_org_dict(o_edge, filled_edge): - assert o_edge.get_org_dict() == o_edge.organisms - assert filled_edge.get_org_dict() == filled_edge.organisms - - -def test_gene_pairs(make_gene_pair): - # cannot use filled_edge because I need access to pair. - p1 = make_gene_pair("org1", "s1", "t1") - p2 = make_gene_pair("org1", "s2", "t1") - p3 = make_gene_pair("org2", "s1", "t2") - p4 = make_gene_pair("org2", "s1", "s2") - # org1: s1,s2 -- t1 - # org2: s1 -- t2,s2 - - o_edge = Edge(*p1) - o_edge.add_genes(*p2) - o_edge.add_genes(*p3) - o_edge.add_genes(*p4) - - # 'set' because the order is not guaranted due to '.values()'. 
- l_pairs = o_edge.gene_pairs - assert set(l_pairs) == {p1, p2, p3, p4} diff --git a/tests/test_GeneFamily.py b/tests/test_GeneFamily.py deleted file mode 100644 index 53023fe4..00000000 --- a/tests/test_GeneFamily.py +++ /dev/null @@ -1,266 +0,0 @@ -#! /usr/bin/env python3 - -import pytest -from random import randint, sample - -from collections import defaultdict - -from ppanggolin.pangenome import Edge -from ppanggolin.geneFamily import GeneFamily -from ppanggolin.genome import Gene - - -def test_cstr(): - identifier = 33 - name = "33" - o_family = GeneFamily(identifier, name) - assert isinstance(o_family, GeneFamily) - - for attr in "ID", "name", "genes", \ - "removed", "sequence", "partition": - assert hasattr(o_family, attr) - assert o_family.ID == identifier - assert o_family.name == name - assert o_family.genes == set() - assert o_family.removed is False - assert o_family.sequence == "" - assert o_family.partition == "" - - -@pytest.fixture() -def o_family(): - return GeneFamily(33, "trente-trois") - - -def test_add_sequence(o_family): - seq = "un de troa" - o_family.add_sequence(seq) - assert o_family.sequence == seq - - -def test_add_partition(o_family): - partition = "un de troa" - o_family.add_partition(partition) - assert o_family.partition == partition - - -def test_named_partition_error(o_family): - with pytest.raises(Exception): - o_family.named_partition - - -@pytest.mark.parametrize("partition, name", - [ - ("P", "persistent"), - ("Pp", "persistent"), - ("P whatever, only first letter is important", "persistent"), - ("C", "cloud"), - ("C loud", "cloud"), - ("C whatever, only first letter is important", "cloud"), - ("S", "shell"), - ("Shut", "shell"), - ("S whatever, only first letter is important", "shell"), - ("un de troa kvar", "undefined"), - ("1", "undefined"), - ("p", "undefined"), - ("c", "undefined"), - ("s", "undefined"), - ]) -def test_named_partition(o_family, partition, name): - o_family.add_partition(partition) - assert 
o_family.named_partition == name - - -@pytest.fixture() -def lo_genes(): - return [Gene(str(i)) for i in range(4)] - - -def test_add_gene_error(o_family, lo_genes): - with pytest.raises(TypeError): - o_family.add_gene(33) - - -def test_add_gene_solo(o_family, lo_genes): - o_gene = Gene(33) - o_family.add_gene(o_gene) - assert o_family.genes == {o_gene} - assert o_gene.family == o_family - - -def test_add_gene_many(o_family, lo_genes): - """ fill the family with genes from the same organism""" - organism = "organism" - for o_gene in lo_genes * 4: # *4 to assert duplicates are not considered - o_gene.fill_parents(organism, None) - o_family.add_gene(o_gene) - assert o_gene.family == o_family - assert o_family.genes == set(lo_genes) - - -def test_mk_bitarray_no_org(o_family): - # index is meaningless - o_family.mk_bitarray(None) - assert o_family.bitarray == 0 - - -def test_mk_bitarray_with_org(o_family): - organism = "organism" - o_gene = Gene(33) - o_gene.fill_parents(organism, None) - - o_family.add_gene(o_gene) - - for i in 1, 3, 7, 12: - index = {organism: i} - o_family.mk_bitarray(index) - assert o_family.bitarray == 1 << i - - -def test_get_org_dict_error(o_family): - with pytest.raises(AttributeError): - o_family.discard('_genePerOrg') - # I don't get how this can happen - - -def test_get_org_dict_empty(o_family): - dd = o_family.get_org_dict() - assert isinstance(dd, defaultdict) - assert 0 == len(dd) - - -def test_get_org_dict(o_family, lo_genes): - """ in lo_genes, none has organism. - I'll add one, several times, creating several sets.""" - n_orgs = randint(2, 10) - for org in range(n_orgs): - for o_gene in lo_genes: - o_gene.fill_parents(org, None) - o_family.add_gene(o_gene) - - dd = o_family.get_org_dict() - assert n_orgs == len(dd) - for org in dd: - assert dd[org] == set(lo_genes) - - # Note: after integration, genes can be edited - # which leads to inconsistent results. - # here the same genes are refered to 2 orgs. 
- # IMO this would be user pb as it is insane user behavior. - - -def test_get_genes_per_org_error(o_family): - with pytest.raises(AttributeError): - o_family.discard('_genePerOrg') - # I don't get how this can happen - - -def test_get_genes_per_org_no_gene(o_family): - org = "org" - - s_genes = o_family.get_genes_per_org(org) - assert 0 == len(s_genes) - - -def test_get_genes_per_org(o_family, lo_genes): - org = "org" - for o_gene in lo_genes: - o_gene.fill_parents(org, None) - o_family.add_gene(o_gene) - s_genes = o_family.get_genes_per_org(org) - assert s_genes == set(lo_genes) - - -def test_organisms_error(o_family, lo_genes): - with pytest.raises(AttributeError): - o_family.discard('_genePerOrg') - # I don't get how this can happen - - -def test_organisms_empty(o_family, lo_genes): - assert set() == o_family.organisms - - -def test_organisms(o_family, lo_genes): - l_org = [] - for o_gene in lo_genes: - org = randint(0, 5) - o_gene.fill_parents(org, None) - o_family.add_gene(o_gene) - l_org.append(org) - - assert set(l_org) == o_family.organisms - - -def test_neighbors_empty(o_family): - assert o_family.neighbors == set() - - -@pytest.fixture -def filled_families(): - """ - return a list of families and genes. - there will be between 3 and 10 genes/families. - Each family has only one gene. - """ - lo_genes = [] - lo_fam = [] - - n_families = randint(3, 10) - for fam in range(n_families): - o_gene = Gene(fam) - o_gene.fill_parents(None, None) - - o_family = GeneFamily(fam, fam) - o_family.add_gene(o_gene) - - lo_genes.append(o_gene) - lo_fam.append(o_family) - - return lo_fam, lo_genes - - -def test_neighbors(filled_families): - lo_fam, lo_genes = filled_families - - # get several genes and make an edge - # between them and the first of the list - n_genes = randint(2, len(lo_genes)) - sample_genes = sample(lo_genes, n_genes) - for o_gene in sample_genes: - # it is strange to me to update family attribute from another class. 
- Edge(lo_genes[0], o_gene) - # we have 0->{*} - - # first gene belong to the first family - # let's get the family neighbors - # set because order is not guaranted - s = set(lo_fam[0].neighbors) - print(s) - assert n_genes == len(s) - - xpected = {g.family for g in sample_genes} - assert xpected == s - - -def test_edges_empty(o_family): - d = o_family.edges - assert 0 == len(d) - - -def test_edges(filled_families): - lo_fam, lo_genes = filled_families - - # get several genes and make an edge - # between them and the first of the list - n_genes = randint(2, len(lo_genes)) - sample_genes = sample(lo_genes, n_genes) - l_edges = [] - for o_gene in sample_genes: - # it is strange to me to update family attribute from another class. - l_edges.append(Edge(lo_genes[0], o_gene)) - # we have 0->{*} - - edge_list = lo_fam[0].edges - # set because order is not guaranted - assert set(l_edges) == set(edge_list) diff --git a/tests/test_Pangenome.py b/tests/test_Pangenome.py deleted file mode 100644 index eb0856d0..00000000 --- a/tests/test_Pangenome.py +++ /dev/null @@ -1,360 +0,0 @@ -#! 
/usr/bin/env python3 - -import pytest -from random import choices, randint, sample - -from ppanggolin.genome import Gene, Organism -from ppanggolin.pangenome import Edge, Pangenome -from ppanggolin.geneFamily import GeneFamily - - -def test_cstr(): - o_pang = Pangenome() - assert isinstance(o_pang, Pangenome) - - for attr in "max_fam_id", "parameters", "status": - assert hasattr(o_pang, attr) - assert o_pang.max_fam_id == 0 - assert o_pang.parameters == {} - assert o_pang.status == {'genomesAnnotated': "No", - 'geneSequences': "No", - 'genesClustered': "No", - 'defragmented': "No", - 'geneFamilySequences': "No", - 'neighborsGraph': "No", - 'partitioned': "No", - 'predictedRGP': "No", - 'spots': "No", - 'modules': 'No', - "metadata": {"families": 'No', - "genes": 'No', - "genomes": 'No', - "RGPs": 'No', - "spots": 'No', - "modules": 'No'}, - "metasources": {"families": [], - "genes": [], - "genomes": [], - "RGPs": [], - "spots": [], - "modules": []} - } - - -@pytest.fixture -def o_pang(): - return Pangenome() - - -# @pytest.mark.xfail(reason="not implemented !") -# def test_add_file(o_pang): -# assert False # need to generate a valid file several time - -@pytest.fixture -def l_orgs(): - l_orgs = [] - for i_org in range(randint(5, 20)): - o_org = Organism(str(i_org)) - l_orgs.append(o_org) - - return l_orgs - - -def test_organisms(o_pang, l_orgs): - # 'set' because order is not guaranted - # and org should be unique - assert set(o_pang.organisms) == set() - - # add Org from Org - for o_org in l_orgs: - o_pang.add_organism(o_org) - - # add Org from string - for i_org in range(randint(5, 20)): - o_org = o_pang.add_organism(str(i_org)) - l_orgs.append(o_org) - - assert set(o_pang.organisms) == set(l_orgs) - - -def test_add_organism_str(o_pang): - o_org = o_pang.add_organism("org1") - assert o_org in o_pang.organisms - assert isinstance(o_org, Organism) - assert set(o_pang.organisms) == {o_org} - - -def test_add_organism(o_pang): - o_org = Organism("org") - assert 
o_pang.add_organism(o_org) == o_org - assert set(o_pang.organisms) == {o_org} - - -def test_number_of_organism(o_pang, l_orgs): - assert o_pang.number_of_organisms() == 0 - - for o_org in l_orgs: - o_pang.add_organism(o_org) - - assert o_pang.number_of_organisms() == len(l_orgs) - - -def test_add_gene_family_one(o_pang): - name = "fam1" - o_fam1 = o_pang.add_gene_family(name) - assert isinstance(o_fam1, GeneFamily) - assert 1 == o_pang.max_fam_id - - -def test_add_gene_family_same(o_pang): - name = "fam1" - o_fam1 = o_pang.add_gene_family(name) - o_fam2 = o_pang.add_gene_family(name) - assert o_fam1 == o_fam2 - - -def test_add_gene_family_many(o_pang): - n_fams = randint(5, 20) - for i_fam in range(n_fams): - o_pang.add_gene_family(str(i_fam)) - assert n_fams == o_pang.max_fam_id - - -def test_get_gene_family(o_pang): - name = "fam1" - o_fam = o_pang.add_gene_family(name) - assert o_pang.get_gene_family(name) == o_fam - - for i_fam in range(randint(5, 20)): - o_pang.add_gene_family(str(i_fam)) - # still true after many insert - assert o_pang.get_gene_family(name) == o_fam - - -def test_number_of_gene_families_empty(o_pang): - assert o_pang.number_of_gene_families() == 0 - - -def test_number_of_gene_families(o_pang): - n_fams = randint(5, 10) - - for i_fam in sample(range(20), k=n_fams): - o_pang.add_gene_family(str(i_fam)) - assert o_pang.number_of_gene_families() == n_fams - - -def test_gene_families_empty(o_pang): - # 'set' because order is not guaranted - assert set(o_pang.gene_families) == set() - - -def test_gene_families(o_pang): - l_ints = choices(range(20), k=10) - s_fams = set() - for i_fam in l_ints: - o_fam = o_pang.add_gene_family(str(i_fam)) - s_fams.add(o_fam) - - assert set(o_pang.gene_families) == s_fams - - -def test_genes_empty(o_pang): - assert list(o_pang.genes) == [] - - -# code copy-pasted from test_Edge.py -@pytest.fixture() -def make_gene_pair(): - def _make_gene_pair(org, gene_id1, gene_id2): - """create a pair of genes that belong to the 
same organism.""" - lo_genes = [] - for k in gene_id1, gene_id2: - o_gene = Gene(k) - o_gene.fill_parents(org, None) - - lo_genes.append(o_gene) - - o_family = GeneFamily(k, k) - o_family.add_gene(o_gene) - return tuple(lo_genes) - - return _make_gene_pair - - -@pytest.fixture() -def make_org_with_genes(): - def _make_org_with_genes(org): - """make an organism, add from 2 to 10 contigs - with 2 to 10 genes each.""" - l_genes = [] - o_org = Organism(org) - for i in range(randint(2, 10)): - o_ctg = o_org.get_contig("k_{}".format(i)) - for j in range(randint(2, 10)): - name = "{}.{}.{}".format(org, o_ctg.name, j) - o_gene = Gene(name) - o_gene.position = j - o_gene.start = j - o_ctg.add_gene(o_gene) - l_genes.append(o_gene) - return o_org, l_genes - - return _make_org_with_genes - - -@pytest.fixture() -def fill_fam_with_genes(): - def _fill_fam_with_genes(o_fam): - """add genes with names from 2 to 10 to a geneFamily object.""" - l_genes = [] - for i in range(2, 10): - name = "{}_{}".format(o_fam.name, i) - o_gene = Gene(name) - o_fam.add_gene(o_gene) - l_genes.append(o_gene) - return l_genes - - return _fill_fam_with_genes - - -def test_genes_organism_debug(o_pang, make_org_with_genes): - # orgs with genes. - o_org, l_genes = make_org_with_genes("org1") - o_pang.add_organism(o_org) - l_expected = sorted(l_genes, key=lambda g: g.ID) - l_observed = sorted(o_pang.genes, key=lambda g: g.ID) - assert l_observed == l_expected - - -def test_genes_genefamilies(o_pang, fill_fam_with_genes): - """Genes are added in pangenome through their family.""" - # geneFamily with genes. 
- o_fam = o_pang.add_gene_family("fam1") - l_genes = fill_fam_with_genes(o_fam) # the list of genes, and the geneFam are supposed to be the same - l_expected = sorted(l_genes, key=lambda g: g.ID) - l_observed = sorted(o_pang.genes, key=lambda g: g.ID) - print(o_pang.genes) - assert l_observed == l_expected - - -def test_edges_empty(o_pang): - assert list(o_pang.edges) == [] - - -def test_add_edge(o_pang, make_gene_pair): - name = "gene_fam" # gene/fam name - to_genes = make_gene_pair("org", name, name) - - o_edge1 = o_pang.add_edge(*to_genes) - assert isinstance(o_edge1, Edge) - - # addEdge doesn't act the same when the edge already exists. - o_edge2 = o_pang.add_edge(*to_genes) - assert o_edge2 == o_edge1 - - -def test_edges_one(o_pang, make_gene_pair): - name = "gene_fam" # gene/fam name - to_genes = make_gene_pair("org", name, name) - - lo_edges = [] - n = randint(1, 5) - for _ in range(n): - lo_edges.append(o_pang.add_edge(*to_genes)) - - # always the same family couple - # = one edge, with several couple of genes - # I use set because edges are uniques, it is not a multigraph. - assert set(o_pang.edges) == set(lo_edges) - assert len(o_pang.edges) == 1 - - o_edge = list(o_pang.edges).pop() - assert o_edge.gene_pairs == [to_genes for _ in range(n)] - - -def test_edges_many_rand(o_pang, make_gene_pair): - lo_edges = [] - n = randint(1, 5) - for i in range(n): - name1 = "gene_" + str(i) # gene/fam name - name2 = str(i) + "_gene" # gene/fam name - to_genes = make_gene_pair("org", name1, name2) - lo_edges.append(o_pang.add_edge(*to_genes)) - # I use set because edges are uniques, it is not a supergraph. 
- assert set(o_pang.edges) == set(lo_edges) - - -def test_edges_several(o_pang, make_gene_pair): - # little more sophisticated - to_genes = make_gene_pair("org", "g1", "g2") - o_fam2 = to_genes[1].family - o_pang.add_edge(*to_genes) - - to_genes = make_gene_pair("org", "g1", "g3") - o_fam3 = to_genes[1].family - o_pang.add_edge(*to_genes) - # g3 -- g1 -- g2 - - to_genes = make_gene_pair("org", "g22", "g33") - o_fam2.add_gene(to_genes[0]) - o_fam3.add_gene(to_genes[1]) - o_pang.add_edge(*to_genes) - # g2 -- g3 - - assert len(o_pang.edges) == 3 - - -def test_get_index(o_pang, l_orgs): - for o_org in l_orgs: - o_pang.add_organism(o_org) - idx = o_pang.get_org_index() - - # after the method, the index exist - assert o_pang.get_org_index() is idx - - # all orgs are in the index - l_observed = sorted(idx.keys(), key=lambda x: x.name) - l_orgs.sort(key=lambda x: x.name) - assert l_observed == l_orgs - - -def test_compute_family_bitarrays(o_pang, l_orgs): - for o_org in l_orgs: - o_pang.add_organism(o_org) - idx = o_pang.get_org_index() - assert o_pang.compute_family_bitarrays() is idx - - -def test_family_have_bitarrays(o_pang, l_orgs): - """test that after the method all the families have a bitarray.""" - n_fams = randint(5, 10) - - l_fams = [] - for i_fam in sample(range(20), k=n_fams): - l_fams.append(o_pang.add_gene_family(str(i_fam))) - o_pang.compute_family_bitarrays() - for o_fam in l_fams: - assert hasattr(o_fam, 'bitarray') - - -def test_get_gene_empty(o_pang): - with pytest.raises(KeyError): - o_pang.get_gene(33) - - -def test_get_gene_org(o_pang, make_org_with_genes): - # orgs with genes. 
- o_org, l_genes = make_org_with_genes("org") - o_pang.add_organism(o_org) - - n = len(l_genes) - for o_gene in sample(l_genes, randint(4, n)): - assert o_pang.get_gene(o_gene.ID) == o_gene - - -def test_get_gene_fam(o_pang, fill_fam_with_genes): - o_fam = o_pang.add_gene_family("fam") - l_genes = fill_fam_with_genes(o_fam) - - for o_gene in l_genes: - assert o_pang.get_gene(o_gene.ID) == o_gene diff --git a/tests/test_edge.py b/tests/test_edge.py new file mode 100644 index 00000000..147cd418 --- /dev/null +++ b/tests/test_edge.py @@ -0,0 +1,131 @@ +#! /usr/bin/env python3 +# coding: utf8 + +import pytest +from typing import Generator, Tuple + +from ppanggolin.genome import Gene, Organism +from ppanggolin.edge import Edge +from ppanggolin.geneFamily import GeneFamily + + +class TestEdge: + @pytest.fixture + def organism(self) -> Generator[Organism, None, None]: + """Generate a basic organism object + """ + yield Organism("organism") + + @pytest.fixture + def families_pair(self) -> Generator[Tuple[GeneFamily, GeneFamily], None, None]: + """Generate a families pair + """ + yield GeneFamily(1, "family1"), GeneFamily(2, "family2") + + @pytest.fixture + def genes_pair(self, organism, families_pair) -> Generator[Tuple[Gene, Gene], None, None]: + """Generate genes_pair + """ + gene1, gene2 = Gene("gene1"), Gene("gene2") + gene1.fill_parents(organism, None) + gene2.fill_parents(organism, None) + gene1.family, gene2.family = GeneFamily(1, "family1"), GeneFamily(2, "family2") + yield gene1, gene2 + + @pytest.fixture + def edge(self, genes_pair): + """Generate a basic edge + """ + edge = Edge(*genes_pair) + yield edge + + def test_constructor(self, genes_pair, organism, families_pair): + """Tests that an Edge object can be created with two genes belonging to different families + """ + gene1, gene2 = genes_pair + edge = Edge(gene1, gene2) + assert edge.source == gene1.family + assert edge.target == gene2.family + assert edge.source._edges[edge.target] == edge + assert 
edge.target._edges[edge.source] == edge + assert edge._organisms == {organism: [(gene1, gene2)]} + + def test_constructor_attribute_error(self): + """ + Tests that an AttributeError is raised when creating an Edge object + with a gene that does not belong to any family + """ + gene1 = Gene('gene1') + gene1.family = GeneFamily(0, 'test') + gene2 = Gene('gene2') + with pytest.raises(AttributeError): + # Test target attribute error + Edge(gene1, gene2) + with pytest.raises(AttributeError): + # Test source attribute error + Edge(gene2, gene1) + + def test_gene_pairs(self, edge, genes_pair): + """Tests that gene pairs' generator return what's expected + """ + assert set(edge.gene_pairs) == {genes_pair} + + def test_get_organisms(self, edge, organism): + """Tests that organism generator return what's expected + """ + assert set(edge.organisms) == {organism} + + def test_get_number_of_organisms(self, edge): + """Tests that the good number of organism is returned + """ + assert isinstance(edge.number_of_organisms, int) + assert edge.number_of_organisms == 1 + + def test_get_organisms_dict(self, edge, organism, genes_pair): + """Tests that organism-gene_pairs dict is built as expected + """ + assert edge.get_organisms_dict() == {organism: [genes_pair]} + + def test_get_organism_genes_pairs(self, edge, organism, genes_pair): + """Tests that the gene pairs corresponding to the organism is returned + """ + assert edge.get_organism_genes_pairs(organism) == [genes_pair] + + def test_edge_add_genes_same_organism(self, edge, genes_pair, organism): + """Tests that genes can be added to the edge that are on the same organism + """ + gene1, gene2, gene3, gene4 = *genes_pair, Gene('gene3'), Gene('gene4') + gene3.fill_parents(organism, None) + gene4.fill_parents(organism, None) + edge.add_genes(gene3, gene4) + assert edge.get_organism_genes_pairs(organism) == [(gene1, gene2), (gene3, gene4)] + + def test_edge_add_genes_different_organisms(self, edge, organism): + """Tests that an 
Exception is raised when adding genes to the edge that are not on the same organism + """ + gene1, gene2 = Gene('gene3'), Gene('gene4') + gene1.fill_parents(organism, None) + org = Organism("org") + gene2.fill_parents(org, None) + with pytest.raises(Exception): + edge.add_genes(gene1, gene2) + + def test_edge_add_genes_one_none_gene(self, edge, organism): + """Tests that a TypeError is raised when adding genes to the edge where one gene is None + """ + gene1 = Gene('gene1') + gene1.fill_parents(organism) + with pytest.raises(TypeError): + edge.add_genes(gene1, None) + with pytest.raises(TypeError): + edge.add_genes(None, gene1) + + def test_edge_add_genes_without_organisms(self, edge, organism): + """Tests that a ValueError is raised when adding genes not filled with organism + """ + gene1, gene2 = Gene('gene1'), Gene('gene2') + gene1.fill_parents(organism, None) + with pytest.raises(ValueError): + edge.add_genes(gene1, gene2) + with pytest.raises(ValueError): + edge.add_genes(gene2, gene1) diff --git a/tests/test_genefamily.py b/tests/test_genefamily.py new file mode 100644 index 00000000..9baddb00 --- /dev/null +++ b/tests/test_genefamily.py @@ -0,0 +1,310 @@ +#! 
/usr/bin/env python3 +# coding: utf8 + +import pytest +from random import randint +from typing import Generator, Set +from itertools import combinations_with_replacement + +from ppanggolin.pangenome import Edge +from ppanggolin.geneFamily import GeneFamily +from ppanggolin.genome import Gene, Organism, Contig +from ppanggolin.region import Spot, Module + + +class TestGeneFamily: + """Tests the gene family class + """ + @pytest.fixture + def family(self) -> Generator[GeneFamily, None, None]: + """Create a gene family for all tests + """ + yield GeneFamily(1, "test") + + def test_construct_gene_family(self, family): + """Tests that a GeneFamily object can be created with valid family_id and name + """ + assert isinstance(family, GeneFamily) + assert all(attr in ["ID", "name", "_edges", "_genePerOrg", "_genes_getter", "removed", "sequence", "partition", + "_spots", "_modules", "bitarray", "_metadata_getter"] for attr in + family.__dict__) # Check that no attribute was added else it should be tested + assert all(hasattr(family, attr) for attr in ["ID", "name", "_edges", "_genePerOrg", "_genes_getter", "removed", + "sequence", "partition", "_spots", "_modules", + "bitarray"]) # Check that no attribute was removed else it should be tested + assert family.ID == 1 + assert family.name == 'test' + assert family._edges == {} + assert family._genePerOrg == {} + assert family._genes_getter == dict() + assert not family.removed # for the repeated family not added in the main graph + assert family.sequence == "" + assert family.partition == "" + assert family._spots == set() + assert family._modules == set() + assert family.bitarray is None + + @pytest.mark.parametrize("partition, name", + [ + ("P", "persistent"), + ("Pp", "persistent"), + ("P whatever, only first letter is important", "persistent"), + ("C", "cloud"), + ("C loud", "cloud"), + ("C whatever, only first letter is important", "cloud"), + ("S", "shell"), + ("Shut", "shell"), + ("S whatever, only first letter is 
important", "shell"), + ("un de troa kvar", "undefined"), + ("1", "undefined"), + ("p", "undefined"), + ("c", "undefined"), + ("s", "undefined"), + ]) + def test_get_named_partition_of_gene_family_object(self, family, partition, name): + """Tests that the named partition of a GeneFamily object can be retrieved + """ + family.partition = partition + assert family.named_partition == name + + def test_get_named_partition_error_partition_empty(self, family): + """Tests that if no partition given to gene family, raise a ValueError + """ + with pytest.raises(ValueError): + _ = family.named_partition + + def test_add_sequence_to_gene_family(self, family): + """Tests that a sequence can be added to a GeneFamily object + """ + family.add_sequence('ATCG') + assert family.sequence == 'ATCG' + + def test_add_gene_to_gene_family(self, family): + """Tests that a Gene object can be added to a GeneFamily object + """ + gene = Gene('gene1') + family.add(gene) + assert gene in family.genes + assert gene.family == family + + def test_add_gene_error(self, family): + """Tests that a non-gene object can't be added to a GeneFamily as gene + """ + with pytest.raises(TypeError): + family.add(33) + + @pytest.fixture + def genes(self) -> Generator[Set[Gene], None, None]: + """Creeate a set of genes to fill gene families + """ + genes = set() + for i in range(1, randint(11, 20)): + gene = Gene(f"gene_{str(i)}") + gene.fill_annotations(start=10*(i-1) + 1, stop=10*i, strand='+', position=i, genetic_code=4) + genes.add(gene) + yield genes + + def test_get_number_of_genes(self, family, genes): + """Tests that the number of genes can be retrieved + """ + for gene in genes: + family.add(gene) + assert isinstance(len(family), int) + assert len(family) == len(genes) + + @pytest.fixture + def organisms(self, genes) -> Generator[Set[Organism], None, None]: + """Create a set of organisms fill with genes to test edges + """ + organisms = set() + genes = list(genes) + nb_organisms = randint(2, 10) + 
nb_genes_per_organisms = len(genes) // nb_organisms + idx_org = 1 + while idx_org < nb_organisms: + organism = Organism(f"organism_{idx_org}") + contig = Contig(f"contig_{idx_org}") + organism.add(contig) + idx_genes = 0 + while idx_genes < nb_genes_per_organisms: + gene = genes[(idx_org - 1) * nb_genes_per_organisms + idx_genes] + gene.fill_parents(organism, contig) + contig[gene.start] = gene + idx_genes += 1 + organisms.add(organism) + idx_org += 1 + # last family fill with all the gene left + organism = Organism(f"organism_{idx_org}") + contig = Contig(f"contig_{idx_org}") + organism.add(contig) + idx_genes = (idx_org - 1) * nb_genes_per_organisms + while idx_genes < len(genes): + gene = genes[idx_genes] + gene.fill_parents(organism, contig) + contig[gene.start] = gene + idx_genes += 1 + organisms.add(organism) + yield organisms + + def test_get_org_dict(self, family, genes, organisms): + """Tests that all organisms and genes are retrieved as expected + """ + for gene in genes: + family.add(gene) + org_dict = family.get_org_dict() + assert isinstance(org_dict, dict) + assert all(isinstance(org, Organism) for org in org_dict.keys()) + assert all(isinstance(gene, Gene) for gene_set in org_dict.values() for gene in gene_set) + assert set(org_dict.keys()) == organisms + assert set([gene for gene_set in org_dict.values() for gene in gene_set]) == genes + + def test_get_org_dict_with_no_organism_fill_to_genes(self, family, genes): + """Tests that if genes are not fill with organism an AttributeError is returned + """ + for gene in genes: + family.add(gene) + with pytest.raises(AttributeError): + _ = family.get_org_dict() + + def test_organisms(self, family, organisms, genes): + """Tests that all organisms are retrieved as expected + """ + for gene in genes: + family.add(gene) + assert set(family.organisms) == organisms + + def test_number_of_organism(self, family, organisms, genes): + """Tests that the expected number of organisms is found + """ + for gene in genes: 
+ family.add(gene) + assert isinstance(family.number_of_organisms, int) + assert family.number_of_organisms == len(organisms) + + def test_get_genes_per_org(self, family, organisms, genes): + """Tests that for a giver organism, all the genes are retrieved as expected + """ + for gene in genes: + family.add(gene) + for organism in organisms: + assert set(family.get_genes_per_org(organism)) == set(organism.genes) + + def test_get_genes_per_org_if_org_not_in_family(self, family): + """Test that a KeyError is generated if an organism not belonging to the family is given + """ + with pytest.raises(KeyError): + org = Organism("organism") + _ = set(family.get_genes_per_org(org)) + + @pytest.fixture + def families(self, genes) -> Generator[Set[GeneFamily], None, None]: + """Create a set of gene families fill with genes to test edges + """ + families = set() + genes = list(genes) + nb_families = randint(2, 10) + nb_genes_per_family = len(genes) // nb_families + idx_fam = 1 + while idx_fam < nb_families: + family = GeneFamily(idx_fam, f"family_{idx_fam}") + idx_genes = 0 + while idx_genes < nb_genes_per_family: + gene = genes[(idx_fam - 1) * nb_genes_per_family + idx_genes] + family.add(gene) + gene.family = family + idx_genes += 1 + families.add(family) + idx_fam += 1 + # last family fill with all the gene left + family = GeneFamily(idx_fam, f"family_{idx_fam}") + idx_genes = (idx_fam - 1) * nb_genes_per_family + while idx_genes < len(genes): + gene = genes[idx_genes] + family.add(gene) + gene.family = family + idx_genes += 1 + families.add(family) + yield families + + @pytest.fixture + def edges(self, families, genes, organisms) -> Generator[Set[Edge], None, None]: + """Create a set of edges fill with genes and gene families to test edges + """ + edges = {} + pair_genes = filter(lambda x: x[0] != x[1] and x[0].organism == x[1].organism, + combinations_with_replacement(genes, 2)) + for pair in pair_genes: + key = frozenset([pair[0].family, pair[1].family]) + edge = 
edges.get(key) + if edge is None: + edge = Edge(pair[0], pair[1]) + edges[key] = edge + else: + edge.add_genes(pair[0], pair[1]) + pair[0].family.set_edge(pair[1].family, edge) + pair[1].family.set_edge(pair[0].family, edge) + yield set(edges.values()) + + def test_get_neighbors_of_gene_family(self, families, edges): + """Tests get all the expected neighbor of the family in the graph + """ + for family in families: + assert all(isinstance(neighbor, GeneFamily) for neighbor in family.neighbors) + expected_neighbors = set([edge.source for edge in edges + if edge.target == family]).union(set([edge.target for edge in edges + if edge.source == family])) + assert set(family.neighbors) == expected_neighbors + + def test_get_number_of_neighbors(self, families, edges): + """Tests that the expected number of neighbors is found + """ + for family in families: + expected_neighbors = set([edge.source for edge in edges + if edge.target == family]).union(set([edge.target for edge in edges + if edge.source == family])) + assert isinstance(family.number_of_neighbors, int) + assert family.number_of_neighbors == len(expected_neighbors) + + # Tests that the edges of a GeneFamily object can be retrieved + def test_get_edges_of_gene_family(self, families, edges): + """Tests that all the edges belonging to the family are retrieved + """ + for family in families: + expected_edges = set([edge for edge in edges if edge.source == family or edge.target == family]) + assert all(isinstance(edge, Edge) for edge in family.edges) + assert set(family.edges) == expected_edges + + def test_get_number_of_edges(self, families, edges): + """Tests that the expected number of edges is found + """ + for family in families: + expected_edges = set([edge for edge in edges if edge.source == family or edge.target == family]) + assert isinstance(family.number_of_edges, int) + assert family.number_of_neighbors == len(expected_edges) + + def test_add_spot_to_gene_family(self, family): + """Tests that a Spot object 
can be added to a GeneFamily object + """ + spot = Spot(1) + family.add_spot(spot) + assert spot in family.spots + + def test_add_non_spot_as_spot_in_family(self, family): + """Tests that a non-spot object cannot be added to Gene Family + """ + with pytest.raises(TypeError): + family.add_spot(323) + + def test_add_module_to_gene_family(self, family): + """Tests that a Module object can be added to a GeneFamily object + """ + module = Module(1) + family.add_module(module) + assert module in family.modules + + def test_add_non_module_as_module_in_family(self, family): + """Tests that a non-module object cannot be added to Gene Family + """ + with pytest.raises(TypeError): + family.add_module(323) + + # TODO test mk_bitarray diff --git a/tests/test_genome.py b/tests/test_genome.py new file mode 100644 index 00000000..4b511b98 --- /dev/null +++ b/tests/test_genome.py @@ -0,0 +1,580 @@ +#! /usr/bin/env python3 +# coding: utf8 + +import pytest +from typing import Generator, Tuple +import gmpy2 + +from ppanggolin.genome import Feature, Gene, RNA, Contig, Organism +from ppanggolin.geneFamily import GeneFamily +from ppanggolin.region import Region + + +class TestFeature: + """Tests Feature class + """ + @pytest.fixture + def feature(self) -> Generator[Feature, None, None]: + """Generate a basic feature for tests + """ + yield Feature('test_id') + + def test_creation(self, feature): + """Tests that 'Feature' is created successfully with the given identifier + """ + assert feature.ID == 'test_id' + assert not feature.is_fragment + assert feature.type == '' + assert feature.start is None + assert feature.stop is None + assert feature.strand is None + assert feature.product is None + assert feature.name is None + assert feature.local_identifier is None + assert feature.organism is None + assert feature.contig is None + assert feature.dna is None + + def test_create_feature_with_identifier_not_instance_string(self): + """Tests that a Feature object cannot be created with a 
non-string type identifier + """ + with pytest.raises(AssertionError): + Feature(4) + + def test_create_feature_empty_identifier(self): + """Tests that a Feature object cannot be created with an empty identifier + """ + with pytest.raises(ValueError): + Feature('') + + def tests_write_organism(self, feature): + """Tests that write feature return feature name as string + """ + assert str(feature) == "test_id" + + def test_fill_annotations(self, feature): + """Tests that 'fill_annotations' method fills the attributes correctly + """ + feature.fill_annotations(1, 10, '+', 'gene_type', 'name', 'product', 'local_id') + assert feature.start == 1 + assert feature.stop == 10 + assert feature.type == 'gene_type' + assert feature.strand == '+' + assert feature.product == 'product' + assert feature.name == 'name' + assert feature.local_identifier == 'local_id' + + def test_fill_annotations_type_error(self, feature): + """Tests that 'fill_annotations' method raises a TypeError if attribute value is not with the correct type + """ + with pytest.raises(TypeError): + feature.fill_annotations('1', 10, '+', 'gene_type', 'name', 'product', 'local_id') + with pytest.raises(TypeError): + feature.fill_annotations(1, "10", '+', 'gene_type', 'name', 'product', 'local_id') + with pytest.raises(TypeError): + feature.fill_annotations(1, 10, 4, 'gene_type', 'name', 'product', 'local_id') + with pytest.raises(TypeError): + feature.fill_annotations(1, 10, "+", 4, 'name', 'product', 'local_id') + with pytest.raises(TypeError): + feature.fill_annotations(1, 10, '+', 'gene_type', 4, 'product', 'local_id') + with pytest.raises(TypeError): + feature.fill_annotations(1, 10, '+', 'gene_type', 'name', 4, 'local_id') + with pytest.raises(TypeError): + feature.fill_annotations(1, 10, '+', 'gene_type', 'name', 'product', 4) + + def test_fill_annotations_value_error(self, feature): + """Tests that 'fill_annotations' method raises a TypeError if strand is not '+' or '-' + """ + with 
pytest.raises(ValueError): + feature.fill_annotations(1, 10, '4', 'gene_type', 'name', 'product', 'local_id') + + def test_fill_parents(self, feature): + """Tests that 'fill_parents' method associates the object with the given organism and contig + """ + organism = Organism('org_id') + contig = Contig('contig_name') + feature.fill_annotations(1, 10, '+', 'gene_type', 'name', 'product', 'local_id') + feature.fill_parents(organism, contig) + assert feature.organism == organism + assert feature.contig == contig + + def test_fill_parents_with_organism_or_contig_only(self, feature): + """Tests that Gene can be filled with only an organism or a contig + """ + organism = Organism('org') + contig = Contig("ctg") + feature.fill_annotations(1, 10, '+', 'gene_type', 'name', 'product', 'local_id') + feature.fill_parents(organism=organism) + assert feature.organism == organism + feature.fill_parents(contig=contig) + assert feature.contig == contig + + def test_fill_parents_with_nothing(self, feature): + """Tests that Gene cannot be filled with neither an organism and a contig + """ + with pytest.raises(AssertionError): + feature.fill_parents() + + def test_set_organism(self, feature): + """Tests that organism setter sets organism with the valid type + """ + organism = Organism('organism') + feature.organism = organism + assert feature.organism == organism + + def test_set_organism_not_isinstance_organism(self, feature): + """Tests that organism setter return TypeError if sets organism with the invalid type + """ + with pytest.raises(TypeError): + feature.organism = 4 + + def test_set_contig(self, feature): + """Tests that contig setter sets contig with the valid type + """ + contig = Contig('contig') + feature.contig = contig + assert feature.contig == contig + + def test_set_contig_not_isinstance_contig(self, feature): + """Tests that contig setter return TypeError if sets contig with the invalid type + """ + with pytest.raises(TypeError): + feature.contig = 4 + + def 
test_add_dna(self, feature): + """Tests that 'add_dna' method adds the DNA sequence to the object successfully + """ + feature.add_sequence('ATCG') + assert feature.dna == 'ATCG' + + def test_add_dna_type_error(self, feature): + """Tests that 'add_dna' method raises a TypeError if the DNA sequence is not a string + """ + with pytest.raises(AssertionError): + feature.add_sequence(123) + + def test_lenght(self, feature): + """Tests len method + """ + feature.fill_annotations(1, 10, '+', 'gene_type', 'name', 'product', 'local_id') + assert isinstance(len(feature), int) + assert len(feature) == 10 + + def test_length_start_or_stop_are_not_known(self): + """Tests that len raises ValueError when start is not known + """ + with pytest.raises(ValueError): + feature = Feature('test') + feature.stop = 10 + len(feature) + with pytest.raises(ValueError): + feature = Feature('test') + feature.start = 1 + len(feature) + + +class TestRNA: + """Tests RNA Class + """ + @pytest.fixture + def rna(self) -> Generator[RNA, None, None]: + """Generate a basic gene for tests + """ + yield RNA('rna') + + def test_create_gene_object(self, rna): + """Tests that a Gene object can be created with a valid gene_id + """ + assert rna.ID == 'rna' + + +class TestGene: + """Tests Gene class + """ + @pytest.fixture + def gene(self) -> Generator[Gene, None, None]: + """Generate a basic gene for tests + """ + yield Gene('gene') + + def test_create_gene_object(self, gene): + """Tests that a Gene object can be created with a valid gene_id + """ + assert gene.ID == 'gene' + assert gene.position is None + assert gene._family is None + assert gene._RGP is None + assert gene.genetic_code is None + assert gene.protein is None + + def test_fill_annotations(self, gene): + """Tests that Gene annotations can be filled with valid parameters + """ + gene.fill_annotations(start=1, stop=10, strand='+', position=10, genetic_code=4) + assert gene.position == 10 + assert gene.genetic_code == 4 + + def 
test_fill_annotations_type_error(self, gene): + """Tests that Gene annotations cannot be filled with invalid parameters + """ + with pytest.raises(TypeError): + gene.fill_annotations(start=1, stop=10, strand='+', position='10', genetic_code=4) + with pytest.raises(TypeError): + gene.fill_annotations(start=1, stop=10, strand='+', position=10, genetic_code="4") + + def test_add_protein(self, gene): + """Tests that a protein sequence can be added to a Gene object + """ + gene.add_protein('MVKLAVLALALAVLALALALAVLALALAVLALALAVLALALAVLALALAVLALALAVLALALAVLALALA') + assert gene.protein == 'MVKLAVLALALAVLALALALAVLALALAVLALALAVLALALAVLALALAVLALALAVLALALAVLALALA' + + def test_add_protein_non_string(self, gene): + """Tests that a non-string protein sequence cannot be added to a Gene object + """ + with pytest.raises(TypeError): + gene.add_protein(123) + + def test_set_family(self, gene): + """Tests that family setter sets family with the valid type + """ + family = GeneFamily(0, 'family') + gene.family = family + assert gene.family == family + + def test_set_family_not_instance_gene_family(self, gene): + """Tests that family setter return TypeError if sets family is not instance GeneFamily + """ + with pytest.raises(TypeError): + gene.family = 4 + + def test_set_rgp(self, gene): + """Tests that RGP setter sets family with the valid type + """ + region = Region(0) + gene.RGP = region + assert gene.RGP == region + + def test_set_rgp_not_instance_region(self, gene): + """Tests that family setter return TypeError if sets rgp is not instance Region + """ + with pytest.raises(TypeError): + gene.RGP = 4 + + +class TestContig: + """Tests Contig class + """ + @pytest.fixture + def contig(self) -> Generator[Contig, None, None]: + """Generate basic contig for tests + """ + yield Contig("contig") + + @pytest.fixture + def gene(self) -> Generator[Gene, None, None]: + """Generate basic gene for tests + """ + gene = Gene('test_gene') + gene.fill_annotations(start=1, stop=10, strand='+', 
position=0, genetic_code=4) + yield gene + + @pytest.fixture + def genes(self) -> Generator[Tuple[Gene, Gene, Gene], None, None]: + """Generate three basic genes for tests + """ + gene1 = Gene('test_gene1') + gene1.fill_annotations(start=1, stop=10, strand='+', position=0, genetic_code=4) + gene2 = Gene('test_gene2') + gene2.fill_annotations(start=11, stop=20, strand='+', position=1, genetic_code=4) + gene3 = Gene('test_gene3') + gene3.fill_annotations(start=21, stop=30, strand='+', position=2, genetic_code=4) + yield gene1, gene2, gene3 + + def test_create_contig(self, contig): + """Tests that a contig is correclty created + """ + assert contig.name == "contig" + assert not contig.is_circular + assert contig._rna_getter == set() # Saving the rna annotations. We're not using them in the vast majority of cases. + assert contig._genes_getter == {} + assert contig._genes_position == [] + assert contig._organism is None + + def tests_write_contig(self, contig): + """Tests that write contig return contig name as string + """ + assert str(contig) == "contig" + + def test_add_gene(self, gene, contig): + """Tests that a gene can be added to the contig + """ + contig.add(gene) + assert len(contig._genes_getter) == 1 + assert len(contig._genes_position) == 1 + assert contig._genes_getter[gene.start] == gene + assert contig._genes_position[0] == gene + + def test_add_gene_at_far_position(self, gene, contig): + """Tests that a gene can be added at each position and between position are fill with None + """ + contig.add(gene) + new_gene = Gene("Gene2") + new_gene.fill_annotations(start=50, stop=72, strand='+', position=6, genetic_code=4) + contig.add(new_gene) + assert len(contig._genes_position) == 7 + assert contig._genes_position[1:6] == [None]*5 + + def test_add_gene_not_instance_gene(self, contig): + """Tests that the contig cannot be fill with a non-gene object + """ + with pytest.raises(TypeError): + contig.add(1) + with pytest.raises(TypeError): + contig[1] = '4' + + 
def test_add_gene_with_start_already_taken(self, contig, gene): + """Tests that the contig cannot be fill with a non-gene object + """ + contig.add(gene) + with pytest.raises(ValueError): + new_gene = Gene('test_gene') + new_gene.fill_annotations(start=1, stop=12, strand='+', position=2, genetic_code=4) + contig.add(new_gene) + + def test_add_gene_without_position(self, contig): + """Test that adding a gene not fill with position raise an AttributeError + """ + with pytest.raises(AttributeError): + gene = Gene('test_gene') + contig.add(gene) + + def test_number_of_genes(self, genes, contig): + """Tests len method + """ + gene1, gene2, gene3 = genes + contig.add(gene1) + contig.add(gene2) + contig.add(gene3) + assert isinstance(contig.number_of_genes, int) + assert contig.number_of_genes == 3 + + def test_get_gene(self, gene, contig): + """Tests that a gene can be retrieved by its position + """ + contig.add(gene) + assert contig[0] == gene + + def test_get_genes(self, genes, contig): + """Tests that a list of genes within a range can be retrieved + """ + gene1, gene2, gene3 = genes + contig.add(gene1) + contig.add(gene2) + contig.add(gene3) + assert set(contig.get_genes(0, 3)) == set(genes) + + def test_get_gene_with_non_integer_index(self, contig): + """Tests that a gene cannot be retrieved with an index that is not an integer + """ + with pytest.raises(TypeError): + _ = contig['a'] + + def test_get_genes_with_non_integer_begin_and_end_positions(self, genes, contig): + """Tests that genes cannot be retrieved with non-integer begin and end positions + """ + gene1, gene2, gene3 = genes + contig.add(gene1) + contig.add(gene2) + contig.add(gene3) + with pytest.raises(TypeError): + contig.get_genes('a', 4) + with pytest.raises(TypeError): + contig.get_genes(5, 'b') + with pytest.raises(TypeError): + contig.get_genes('a', 'b') + + def test_get_genes_with_end_position_lower_than_begin_position(self, genes, contig): + """Tests that genes cannot be retrieved with end 
position lower than begin position + """ + gene1, gene2, gene3 = genes + contig.add(gene1) + contig.add(gene2) + contig.add(gene3) + with pytest.raises(ValueError): + contig.get_genes(2, 0) + + def test_iterate_over_genes(self, genes, contig): + """Tests that all genes in the contig can be iterated over + """ + gene1, gene2, gene3 = genes + contig.add(gene1) + contig.add(gene2) + contig.add(gene3) + assert list(contig.genes) == sorted([gene1, gene2, gene3], key=lambda x: x.position) + + def test_add_rna(self, contig): + """Tests that an RNA can be added to the contig + """ + rna = RNA('test_rna') + contig.add_rna(rna) + assert list(contig.RNAs) == [rna] + + def test_set_organism(self, contig): + """Tests that an organism can be set to the contig + """ + organism = Organism("organism") + contig.organism = organism + assert contig.organism == organism + + def test_set_organism_with_not_instance_organism(self, contig): + """Tests that the contig cannot be fill with a non-organism object + """ + with pytest.raises(TypeError): + contig.organism = 4 + + +class TestOrganism: + """Tests Contig class + """ + @pytest.fixture + def organism(self) -> Generator[Organism, None, None]: + """Generate a basic organism for test + """ + yield Organism('organism') + + @pytest.fixture + def contig(self) -> Generator[Contig, None, None]: + """Generate a basic contig for test + """ + yield Contig("contig") + + @pytest.fixture + def gene(self) -> Generator[Gene, None, None]: + """Generate a basic gene for test + """ + gene = Gene('test_gene') + gene.fill_annotations(start=1, stop=10, strand='+', position=0, genetic_code=4) + yield gene + + def test_create_organism(self, organism): + """Tests that an Organism instance can be created with a valid name + """ + assert organism.name == 'organism' + assert organism._contigs_getter == {} + assert organism._families is None + assert organism.bitarray is None + + def test_create_organism_empty_name(self): + """Tests that an Organism instance 
cannot be created with an empty name + """ + with pytest.raises(AssertionError): + Organism('') + + def test_create_organism_with_name_not_string(self): + """Tests that an Organism instance cannot be created with a name not instance string + """ + with pytest.raises(AssertionError): + Organism(4) + + def tests_write_organism(self, organism): + """Tests that write organism return organism name as string + """ + assert str(organism) == "organism" + + def test_add_contig(self, organism, contig): + """Tests that a contig can be added to an Organism instance + """ + organism.add(contig) + assert organism._contigs_getter['contig'] == contig + + def test_add_contig_not_instance_contig(self, organism): + """Tests that a non Contig object cannot be added to an Organism instance + """ + with pytest.raises(AssertionError): + organism.add(4) + + def test_add_contig_existing_name(self, organism, contig): + """Tests that a contig with an existing name cannot be added to an Organism instance + """ + organism.add(contig) + with pytest.raises(KeyError): + organism.add(Contig('contig')) + + def test_get_contig(self, organism, contig): + """Tests that a contig can be retrieved from an Organism instance + """ + organism.add(contig) + assert organism.get('contig') == contig + + def test_get_contig_not_instance_string(self, organism): + """Tests that a non Contig object cannot be added to an Organism instance + """ + with pytest.raises(TypeError): + organism.get(4) + + def test_get_nonexistent_contig(self, organism): + """Tests that a non-existent contig cannot be retrieved from an Organism instance + """ + with pytest.raises(KeyError): + organism.get('contig1') + + def test_number_of_contigs(self, organism): + """Tests that the number of contigs in an organism instance can be retrieved + """ + organism.add(Contig('contig1')) + organism.add(Contig('contig2')) + assert organism.number_of_contigs() == 2 + + def test_get_families(self, organism, contig, gene): + """Tests that gene families 
in an organism can be retrieved + """ + family = GeneFamily(0, "fam") + family.add(gene) + gene.fill_parents(organism, contig) + organism.add(contig) + contig[gene.start] = gene + assert set(organism.families) == {family} + + def test_number_of_families(self, organism, contig, gene): + """Tests that the number of gene families in an organism instance can be retrieved + """ + family = GeneFamily(0, "fam") + family.add(gene) + gene.fill_parents(organism, contig) + organism.add(contig) + contig.add(gene) + assert organism.number_of_families() == 1 + + def tests_get_genes(self, organism, contig, gene): + """Tests that genes in an organism can be retrieved + """ + gene.fill_parents(organism, contig) + organism.add(contig) + contig.add(gene) + assert set(organism.genes) == {gene} + + def test_number_of_genes(self, organism, contig, gene): + """Tests that the number of genes in an organism instance can be retrieved + """ + gene.fill_parents(organism, contig) + organism.add(contig) + contig.add(gene) + assert organism.number_of_genes() == 1 + + def test_mk_bitarray(self, organism, contig): + """Tests that a bitarray can be created for an Organism instance + """ + fam1 = GeneFamily(1, 'fam1') + fam2 = GeneFamily(2, 'fam2') + gene1 = Gene('gene1') + gene2 = Gene('gene2') + gene1.fill_annotations(start=1, stop=10, strand='+', position=0, genetic_code=4) + gene2.fill_annotations(start=11, stop=19, strand='+', position=1, genetic_code=4) + fam1.add(gene1) + fam2.add(gene2) + contig[gene1.start] = gene1 + contig[gene2.start] = gene2 + organism.add(contig) + index = {fam1: 1, fam2: 2} + organism.mk_bitarray(index) + assert organism.bitarray == gmpy2.xmpz(6) diff --git a/tests/test_metadata.py b/tests/test_metadata.py new file mode 100644 index 00000000..d4bab2e4 --- /dev/null +++ b/tests/test_metadata.py @@ -0,0 +1,149 @@ +#! 
/usr/bin/env python3 +# coding: utf8 + +import pytest +from random import randint +from typing import Generator, Set + +from ppanggolin.metadata import Metadata, MetaFeatures + + +class TestMetadata: + @pytest.fixture + def metadata(self) -> Generator[Metadata, None, None]: + """Create a simple metadata + """ + yield Metadata("source", attribute1="value1", attribute2=["value2", "value3"]) + + def test_constructor(self, metadata): + """Tests that the Metadata object is created successfully with a valid source and attributes + """ + assert metadata.source == "source" + assert metadata.attribute1 == "value1" + assert metadata.attribute2 == "value2,value3" + + def test_constructor_with_empty_source_name(self): + """Tests that a ValueError is raised when creating a Metadata object with an empty source name + """ + with pytest.raises(ValueError): + Metadata("", attribute="value") + + def test_constructor_with_non_string_source_name(self): + """Tests that a TypeError is raised when creating a Metadata object with a non-string source name + """ + with pytest.raises(TypeError): + Metadata(123, attribute="value") + + def test_constructor_with_no_attributes(self): + """Tests that an Exception is raised when creating a Metadata object with no attributes + """ + with pytest.raises(Exception): + Metadata("source") + + def test_get_existing_attribute_value(self, metadata): + """Tests that the value of an existing attribute is returned correctly + """ + assert metadata.attribute1 == "value1" + + def test_get_non_existing_attribute_value(self, metadata): + """Tests that an AttributeError is raised when getting the value of a non-existing attribute + """ + with pytest.raises(AttributeError): + _ = metadata.non_existing_attribute + + def test_attribute_fields(self, metadata): + """Tests that the 'fields' method returns a list of all the attributes in the Metadata object + """ + assert metadata.fields == ["attribute1", "attribute2"] + + def test_length(self, metadata): + """Tests that 
the number_of_attribute method returns the correct number of attributes in the Metadata object + """ + assert isinstance(len(metadata), int) + assert len(metadata) == 2 + + +class TestMetaFeatures: + @pytest.fixture + def metadata(self) -> Generator[Set[Metadata], None, None]: + """Create a random number of metadata + + :return: Set of metadata + """ + metadata = set() + for i in range(randint(5, 10)): + metadata.add(Metadata(f"source_{i}", **{f"attr_{j}": j for j in range(randint(1, 5))})) + yield metadata + + @pytest.fixture + def metafeatures(self, metadata) -> Generator[MetaFeatures, None, None]: + """Create a simple metafeature object + + :return: metafeature fill with metadata + """ + metafeatures = MetaFeatures() + for meta in metadata: + metafeatures.add_metadata(meta.source, meta) + yield metafeatures + + def test_add_metadata(self, metafeatures, metadata): + """Tests that metadata can be added to the metadata getter + """ + assert all(metafeatures._metadata_getter[meta.source] == [meta] for meta in metadata) + + def test_get_metadata_feature_corresponding_to_source(self, metafeatures, metadata): + """Tests that all the metadata features corresponding to a source can be retrieved + """ + assert all(metafeatures.get_metadata_by_source(meta.source) == [meta] for meta in metadata) + + def test_remove_source_from_feature(self, metafeatures): + """Tests that a source can be removed from the feature + """ + metadata = Metadata("source_del", attribute1="value") + metafeatures.add_metadata("source_del", metadata) + metafeatures.del_metadata_by_source("source_del") + assert metafeatures.get_metadata_by_source("source_del") is None + + def test_generate_all_metadata_sources(self, metafeatures, metadata): + """Tests that all metadata sources can be generated + """ + assert list(metafeatures.sources) == [meta.source for meta in metadata] + + def test_get_metadata_by_attribute_values(self, metafeatures): + """Tests that metadata can be retrieved based on attribute 
values + """ + meta = Metadata("source_test", attribute1="value_to_retrieve") + # meta_list = Metadata("source_list", attribute1=["val_1", "val_2"]) + metafeatures.add_metadata(meta.source, meta) + # metafeatures[meta_list.source] = meta_list + assert list(metafeatures.get_metadata_by_attribute(attribute1="value_to_retrieve")) == [meta] + # assert list(metafeatures.get_metadata(attribute1="val_1")) == [meta_list] + + def test_get_maximum_number_of_metadata_for_one_source(self, metafeatures, metadata): + """Tests that the maximum number of metadata for one source can be retrieved + """ + metadata1 = Metadata("source_max", attribute1="value1") + metadata2 = Metadata("source_max", attribute2="value2") + metafeatures.add_metadata("source_max", metadata1) + metafeatures.add_metadata("source_max", metadata2) + assert metafeatures.max_metadata_by_source() == ("source_max", 2) + + def test_metadata_is_not_with_type_metadata(self, metafeatures): + """Tests that an AssertionError is raised when metadata is not with type Metadata + """ + with pytest.raises(AssertionError): + metafeatures.add_metadata("source1", "not_metadata") + + def test_source_is_not_a_string(self, metafeatures): + """Tests that an AssertionError is raised when the source is not a string + """ + + metadata = Metadata("source1", attribute1="value1") + with pytest.raises(AssertionError): + metafeatures.add_metadata(1, metadata) + + def test_source_or_metadata_is_not_with_correct_type(self, metafeatures, metadata): + """Tests that an AssertionError is raised when the source or metadata is not with the correct type + """ + with pytest.raises(AssertionError): + metafeatures.add_metadata(1, "not_metadata") diff --git a/tests/test_pangenome.py b/tests/test_pangenome.py new file mode 100644 index 00000000..6a4d7e72 --- /dev/null +++ b/tests/test_pangenome.py @@ -0,0 +1,891 @@ +#! 
/usr/bin/env python3 +# coding: utf8 + +import pytest +from random import choices, randint +from typing import Generator, Set, Tuple, Union + +from ppanggolin.genome import Gene, Organism, Contig +from ppanggolin.pangenome import Pangenome +from ppanggolin.edge import Edge +from ppanggolin.geneFamily import GeneFamily +from ppanggolin.region import Region, Spot, Module +from ppanggolin.metadata import Metadata + + +class TestPangenome: + """This class tests methods in pangenome class associated to pangenome direclty. + For pangenome components, there are subclasses to test each component. + This class also generate a pangenome for all the test + """ + + @pytest.fixture + def pangenome(self) -> Generator[Pangenome, None, None]: + """Create a pangenomes object for test + + :return: Generator with the pangenome object + """ + pangenome = Pangenome() + yield pangenome + + def test_cstr(self, pangenome): + """ + Tests the constructor method of the Pangenome class. + It checks that all attributes are present and have the correct type and or value. 
+ + :param pangenome: Test the function + + :return: A pangenome object + """ + pangenome_attr_type = { + "file": type(None), + "_famGetter": dict, + "_org_index": type(None), + "_fam_index": type(None), + "_max_fam_id": int, + "_orgGetter": dict, + "_edgeGetter": dict, + "_regionGetter": dict, + "_spotGetter": dict, + "_moduleGetter": dict, + "status": dict, + "parameters": dict + } + status_keys = [ + 'genomesAnnotated', + 'geneSequences', + 'genesClustered', + 'defragmented', + 'geneFamilySequences', + 'neighborsGraph', + 'partitioned', + 'predictedRGP', + 'spots', + 'modules', + "metadata", + "metasources" + ] + metadata_keys = [ + "families", + "genes", + "genomes", + "RGPs", + "spots", + "modules" + ] + for attr, attr_type in pangenome_attr_type.items(): + assert hasattr(pangenome, attr) + assert isinstance(pangenome.__getattribute__(attr), attr_type) + if attr_type == dict: + if attr == "status": + assert len(pangenome.status) == len(status_keys) + else: + assert len(pangenome.__getattribute__(attr)) == 0 + + for status_key in status_keys: + assert status_key in pangenome.status + if status_key not in ["metadata", "metasources"]: + assert pangenome.status[status_key] == "No" + else: + assert_res = "No" if status_key == "metadata" else [] + for metadata_key in metadata_keys: + assert metadata_key in pangenome.status[status_key] + assert pangenome.status[status_key][metadata_key] == assert_res + assert pangenome.max_fam_id == 0 + + def test_is_instance_pangenome(self, pangenome): + """Tests whether the pangenome object is an instance of the Pangenome class. + This test is important because it ensures that the class name does not change and that we are working + with a Pangenome object, and not some other type of object. 
+ + :param pangenome: Object to test if is an instance of the pangenome class + + :raise AssertionError: If pangenome is not an instance of the pangenome class + """ + assert isinstance(pangenome, Pangenome) + + def test_add_file_is_not_path(self, pangenome): + """Tests that the add_file method raises an AssertionError if a file is not an instance of the Path class + + :param pangenome: Pangenome object to test method + """ + with pytest.raises(AssertionError): + pangenome.add_file("pangenome.h5") + + +class TestPangenomeOrganism(TestPangenome): + """This class tests methods in pangenome class associated to organisms. + """ + + @pytest.fixture + def organism(self) -> Generator[Organism, None, None]: + """Create a basic organism + """ + yield Organism(name="organism") + + def test_add_organism(self, pangenome, organism): + """Tests the add_organism method of the Pangenome class. + + :param pangenome: Pangenome object to test method + :param organism: organism object to test method + """ + pangenome.add_organism(organism) + assert set(pangenome.organisms) == {organism} + + def test_add_organism_already_in_pangenome(self, pangenome, organism): + """Tests that adding organism that already exist return a KeyError. + + :param pangenome: Pangenome object to test method + :param organism: organism object to test method + """ + pangenome.add_organism(organism) + with pytest.raises(KeyError): + pangenome.add_organism(organism) + + def test_add_organism_not_instance_organism(self, pangenome): + """Ensure that it raises an AssertionError when a non-Organism object is passed as an argument. + + :param pangenome: Pangenome object to test method + """ + with pytest.raises(AssertionError): + pangenome.add_organism("org") + + def test_get_organism(self, pangenome, organism): + """Tests the get_organism method of the Pangenome class. 
+ + :param pangenome: Pangenome object to test method + :param organism: organism object to test method + """ + pangenome.add_organism(organism) + get_org = pangenome.get_organism("organism") + assert isinstance(get_org, Organism) + assert organism == get_org + + def test_get_organism_not_in_pangenome(self, pangenome): + """Ensure that it raises a KeyError when an Organism is not in the pangenome. + + :param pangenome: Pangenome object to test method + """ + with pytest.raises(KeyError): + pangenome.get_organism('org') + + def test_get_organism_with_name_not_instance_string(self, pangenome): + """Ensure that it raises an AssertionError when a non-string name is passed as organism name. + + :param pangenome: Pangenome object to test method + """ + with pytest.raises(AssertionError): + pangenome.get_organism(33) + + @pytest.fixture + def organisms(self) -> Generator[Set[Organism], None, None]: + """Create a set of organism object for test + + :return: Generator with the set of organism object + """ + orgs = set() + for i in range(randint(5, 20)): + org = Organism(str(i)) + orgs.add(org) + yield orgs + + @pytest.fixture + def add_organisms(self, pangenome, organisms): + """Add the set of organims to pangenome + + :param pangenome: Pangenome object to test method + :param organisms: Set of organisms to add to pangenome + """ + for org in organisms: + pangenome.add_organism(org) + + def test_number_of_organisms(self, add_organisms, pangenome, organisms): + """Tests the number_of_organisms method of the pangenome class. + + :param add_organisms: Method to add organisms + :param pangenome: Pangenome object to test method + :param organisms: Set of organisms to add to pangenome + """ + assert isinstance(pangenome.number_of_organisms, int) + assert pangenome.number_of_organisms == len(organisms) + + +class TestPangenomeGeneFamilies(TestPangenome): + """This class tests methods in pangenome class associated to gene families. 
+ """ + + @pytest.fixture + def family(self) -> Generator[GeneFamily, None, None]: + """Create a Gene Family object + + :return: Generator with a Gene Family object + """ + family = GeneFamily(0, "family") + yield family + + def test_max_fam_id_is_instance_int_and_egal_zero(self, pangenome): + """Tests that the max_fam_id attribute is corretly set + + :param pangenome: Pangenome object to test method + """ + assert isinstance(pangenome.max_fam_id, int) + assert pangenome.max_fam_id == 0 + + def test_add_gene_family(self, pangenome, family): + """Tests the add_gene_family method of the Pangenome class. + + :param pangenome: Pangenome object to test method + :param family: gene family object to test method + """ + pangenome.add_gene_family(family) + assert 1 == pangenome.max_fam_id + assert set(pangenome.gene_families) == {family} + + def test_add_gene_family_already_in_pangenome(self, pangenome, family): + """Tests that adding gene family that already exist return a KeyError. + + :param pangenome: Pangenome object to test method + :param family: gene family object to test method + """ + pangenome.add_gene_family(family) + with pytest.raises(KeyError): + pangenome.add_gene_family(family) + + def test_get_gene_family(self, pangenome, family): + """Tests that get_gene_family return a gene family object corresponding to the requested gene family + + :param pangenome: Pangenome object to test method + :param family: gene family object to test method + """ + pangenome.add_gene_family(family) + assert isinstance(pangenome.get_gene_family("family"), GeneFamily) + assert pangenome.get_gene_family("family") == family + + def test_get_gene_family_not_in_pangenome(self, pangenome, family): + """Tests that return a KeyError if family does not exist in pangenome + + :param pangenome: Pangenome object to test method + :param family: gene family object to test method + """ + with pytest.raises(KeyError): + pangenome.get_gene_family("fam") + + def 
test_get_gene_family_with_name_not_isinstance_string(self, pangenome): + """Tests that return an AssertionError if family name used to get family is not string + + :param pangenome: Pangenome object to test method + """ + with pytest.raises(AssertionError): + pangenome.get_gene_family(3) + + @pytest.fixture + def families(self) -> Generator[Set[GeneFamily], None, None]: + """Create a set of Gene Family object for test + + :return: Generator with the set of organism object + """ + families = set() + for i in range(randint(5, 20)): + family = GeneFamily(family_id=i, name=f'family{i}') + families.add(family) + yield families + + @pytest.fixture + def add_families(self, pangenome, families): + """Add the set of gene families to pangenome + + :param pangenome: pangenome object to test method + :param families: set of gene families to add to pangenome + """ + for family in families: + pangenome.add_gene_family(family) + + def test_number_of_gene_families_empty(self, add_families, pangenome, families): + """Tests the number_of_gene_families method of the pangenome class. + + :param add_families: Method to add gene families + :param pangenome: pangenome object to test method + :param families: set of families to add to pangenome + """ + assert isinstance(pangenome.number_of_gene_families, int) + assert pangenome.number_of_gene_families == len(families) + + +class TestPangenomeGene(TestPangenome): + """This class tests methods in pangenome class associated to Gene. 
+ """ + + @pytest.fixture + def genes(self) -> Generator[Set[Gene], None, None]: + """Create a set of Gene object for test + + :return: Generator with the set of organism object + """ + genes = set() + for i in range(randint(5, 20)): + gene = Gene(gene_id=i) + genes.add(gene) + yield genes + + @pytest.fixture(name="organism_genes") + def fill_org_with_genes(self) -> Generator[Union[Organism, Set[Gene]], None, None]: + """Fill an organism with a random set of gene + + :return: Organism with genes + """ + genes = set() + organism = Organism(name="organism") + for contig_id in range(randint(2, 10)): + contig = Contig("k_{}".format(contig_id)) + organism.add(contig) + for gene_idx in range(randint(2, 10)): + gene = Gene(gene_id=f"{organism.name}.{contig_id}.{gene_idx}") + gene.position = gene_idx + gene.start = gene_idx + contig[gene.start] = gene + genes.add(gene) + yield organism, genes + + @pytest.fixture(name="family_genes") + def fill_family_with_genes(self, pangenome): + """Fill a gene family with a random set of gene + + :return: Gene family with genes + """ + genes = set() + family = GeneFamily(family_id=pangenome.max_fam_id, name="family") + for gene_idx in range(randint(2, 10)): + gene = Gene(gene_id=f"{family.name}_{gene_idx}") + gene.position = gene_idx + gene.start = gene_idx + family.add(gene) + genes.add(gene) + yield family, genes + + def test_genes_generator_from_organism(self, pangenome, organism_genes): + """Tests genes generator from organism in the pangenome object + + :param pangenome: Pangenome object + :param organism_genes: method to get an organism object filled with genes + """ + organism, genes = organism_genes + pangenome.add_organism(organism) + assert genes == set(pangenome.genes) + + def test_get_gene_with_organism(self, pangenome, organism_genes): + """Tests get genes from organism in pangenome object + + :param pangenome: Pangenome object + :param organism_genes: Method to get an organism object filled with genes + """ + organism, 
genes = organism_genes + pangenome.add_organism(organism) + for gene in genes: + assert pangenome.get_gene(gene.ID) == gene + + def test_genes_generator_from_gene_families(self, family_genes, pangenome): + """Tests genes generator from gene families in pangenome object + + :param pangenome: Pangenome object to test method + :param family_genes: method to get a gene family object filled with genes + """ + family, genes = family_genes + pangenome.add_gene_family(family) + assert genes == set(pangenome.genes) + + def test_get_with_gene_family(self, pangenome, family_genes): + """Tests genes generator from gene families in pangenome object + + :param pangenome: Pangenome object to test method + :param family_genes: method to get a gene family object filled with genes + """ + family, genes = family_genes + pangenome.add_gene_family(family) + for gene in genes: + assert pangenome.get_gene(gene.ID) == gene + + def test_get_gene_not_in_pangenome(self, pangenome): + """Tests that return a KeyError if gene does not exist in pangenome + + :param pangenome: Pangenome object to test method + """ + with pytest.raises(KeyError): + pangenome.get_gene("12151405613024") + + def test_get_gene_with_id_not_string(self, pangenome): + """Tests that return an AssertionError if gene identifier is not a string + + :param pangenome: Pangenome object to test method + """ + with pytest.raises(AssertionError): + pangenome.get_gene(gene_id=4) + + def test_number_of_genes(self, pangenome, organism_genes): + """Tests get number of genes in pangenome object + + :param pangenome: pangenome object to test method + :param organism_genes: method to get a organism object fill with genes + """ + organism, genes = organism_genes + pangenome.add_organism(organism) + assert isinstance(pangenome.number_of_genes, int) + assert pangenome.number_of_genes == len(genes) + + def test_get_multigenic(self, pangenome): + # TODO make a better test + """Tests get multigenic genes in pangenome object + + :param 
pangenome: pangenome object to test method + """ + multigenic = pangenome.get_multigenics(0.5) + assert isinstance(multigenic, set) + + +class TestPangenomeEdge(TestPangenome): + """This class tests methods in pangenome class associated to Edge. + """ + + @staticmethod + def make_gene_pair(gene_id_1: int = 1, gene_id_2: int = 2) -> Tuple[Gene, Gene]: + """Create a pair of genes that belong to the same organism in two different families + + :return: Two genes linked to contigs, organism and gene families + """ + gene1 = Gene(gene_id=f"gene_{gene_id_1}") + gene2 = Gene(gene_id=f"gene_{gene_id_2}") + fam1 = GeneFamily(family_id=1, name=f"fam_{gene_id_1}") + fam2 = GeneFamily(family_id=2, name=f"fam_{gene_id_2}") + ctg1 = Contig(name=f"ctg_{gene_id_1}") + ctg2 = Contig(name=f"ctg_{gene_id_2}") + fam1.add(gene1) + fam2.add(gene2) + organism = Organism(name=f"org_{choices([gene_id_1, gene_id_2], k=1)}") + gene1.fill_parents(organism, ctg1) + gene2.fill_parents(organism, ctg2) + return gene1, gene2 + + @pytest.fixture + def gene_pair(self) -> Generator[Tuple[Gene, Gene], None, None]: + """Call method to create a pair of genes that belong to the same organism in two different families + + :return: Two genes linked to contigs, organism and gene families + """ + yield self.make_gene_pair() + + def test_add_edge(self, pangenome, gene_pair): + """Tests the add_edge method of the Pangenome class. + + :param pangenome: Pangenome object to test method + :param gene_pair: Pair of gene coding for the edge + """ + gene1, gene2 = gene_pair + edge = pangenome.add_edge(gene1, gene2) + assert isinstance(edge, Edge) + assert set(pangenome.edges) == {edge} + + def test_add_edge_already_in_pangenome(self, pangenome, gene_pair): + """Tests that adding the same pair of genes as edge return the edge. 
+ + :param pangenome: Pangenome object to test method + :param gene_pair: Pair of gene coding for the edge + """ + gene1, gene2 = gene_pair + edge = pangenome.add_edge(gene1, gene2) + assert pangenome.add_edge(gene1, gene2) == edge + + def test_add_edge_with_gene_not_isinstance_gene(self, pangenome): + """Tests that return an AssertionError if genes are not Gene objects + + :param pangenome: Pangenome object to test method + """ + with pytest.raises(AssertionError): + pangenome.add_edge("gene1", "gene2") + + def test_number_of_edges(self, pangenome, gene_pair): + """Tests the number_of_edges method of the Pangenome class. + + :param pangenome: Pangenome object to test method + :param gene_pair: Pair of gene coding for the edge + """ + pangenome.add_edge(*gene_pair) + assert isinstance(pangenome.number_of_edges, int) + assert pangenome.number_of_edges == 1 + + +class TestPangenomeBinary(TestPangenomeOrganism, TestPangenomeGeneFamilies): + """This class tests methods in pangenome class associated to binary methods. 
+ """ + + # TODO Better test for this part + def test_get_org_index(self, add_organisms, pangenome): + """Tests the get_org_index function in pangenome class + + :param add_organisms: Add organisms to the pangenome + :param pangenome: Pass the pangenome object + """ + orgs_index = pangenome.get_org_index() + assert isinstance(orgs_index, dict) + index_know = set() + for org, index in orgs_index.items(): + assert isinstance(org, Organism) + assert isinstance(index, int) + assert index not in index_know + index_know.add(index) + + def test_compute_family_bitarrays_with_index_already_computed(self, add_organisms, add_families, pangenome): + """Tests the compute_family_bitarrays function in Pangenome class + + :param add_families: Add families to the pangenome object + :param pangenome: Access the pangenome object + """ + org_idx = pangenome.get_org_index() + assert pangenome.compute_family_bitarrays() == org_idx + + def test_compute_family_bitarrays_without_index_already_computed(self, add_organisms, add_families, pangenome): + """Tests the compute_family_bitarrays function of the Pangenome class. 
+ + :param add_families: Add families to the pangenome + :param pangenome: Test the compute_family_bitarrays function + """ + pangenome.compute_family_bitarrays() + for family in pangenome.gene_families: + assert family.bitarray is not None + + def test_get_fam_index(self, add_families, pangenome): + """Tests the get_org_index function in pangenome class + + :param add_families: Add families to the pangenome + :param pangenome: Pass the pangenome object + """ + fams_index = pangenome.get_fam_index() + assert isinstance(fams_index, dict) + index_know = set() + for fam, index in fams_index.items(): + assert isinstance(fam, GeneFamily) + assert isinstance(index, int) + assert index not in index_know + index_know.add(index) + + def test_compute_org_bitarrays_with_index_already_computed(self, add_organisms, add_families, pangenome): + """Tests the compute_family_bitarrays function in Pangenome class + + :param add_families: Add families to the pangenome object + :param pangenome: Access the pangenome object + """ + fams_index = pangenome.get_fam_index() + assert pangenome.compute_org_bitarrays() == fams_index + + def test_compute_org_bitarrays_without_index_already_computed(self, add_organisms, add_families, pangenome): + """Tests the compute_family_bitarrays function of the Pangenome class. + + :param add_families: Add families to the pangenome + :param pangenome: Test the compute_family_bitarrays function + """ + pangenome.compute_org_bitarrays() + for organism in pangenome.organisms: + assert organism.bitarray is not None + + +class TestPangenomeRGP(TestPangenome): + """This class tests methods in pangenome class associated to Region + """ + + def test_add_region(self, pangenome): + """Tests the add_region method in the Pangenome class. 
+ + :param pangenome: Access the pangenome object + """ + rgp = Region(name="rgp") + pangenome.add_region(rgp) + assert len(pangenome._regionGetter) == 1 + assert pangenome._regionGetter["rgp"] == rgp + + def test_add_region_already_in_pangenome(self, pangenome): + """Tests that adding region already in pangenome return a KeyError. + + :param pangenome: Access the pangenome object + """ + rgp = Region(name="rgp") + pangenome.add_region(rgp) + with pytest.raises(KeyError): + pangenome.add_region(rgp) + + def test_add_region_with_isinstance_not_region(self, pangenome): + """Tests that adding an object with not Region type return an AssertionError. + + :param pangenome: Access the pangenome object + """ + with pytest.raises(AssertionError): + pangenome.add_region("rgp") + + def test_get_region(self, pangenome): + """Tests the get_region method in the Pangenome class. + + :param pangenome: Access the pangenome object + """ + rgp = Region(name="rgp") + pangenome.add_region(rgp) + assert pangenome.get_region("rgp") == rgp + + def test_get_region_not_in_pangenome(self, pangenome): + """Tests get the region not in pangenome return a KeyError. + + :param pangenome: Access the pangenome object + """ + with pytest.raises(KeyError): + pangenome.get_region("rgp") + + def test_get_region_with_isinstance_not_string(self, pangenome): + """Tests that getting a region with not string as identifier return an AssertionError. + + :param pangenome: Access the pangenome object + """ + with pytest.raises(AssertionError): + pangenome.get_region(15646) + + def test_number_of_rgp(self, pangenome): + """Tests the number_of_rgp method in the Pangenome class. + + :param pangenome: Pass the pangenome object to the function + """ + rgp = Region(name="rgp") + pangenome.add_region(rgp) + assert isinstance(pangenome.number_of_rgp, int) + assert pangenome.number_of_rgp == 1 + + +class TestPangenomeSpot(TestPangenome): + """This class tests methods in pangenome class associated to Spot. 
+ """ + + def test_add_spot(self, pangenome): + """Tests the add_spot method in the Pangenome class. + + :param pangenome: Access the pangenome object + """ + spot = Spot(spot_id=0) + pangenome.add_spot(spot) + assert len(pangenome._spotGetter) == 1 + assert pangenome._spotGetter[0] == spot + + def test_add_spot_already_in_pangenome(self, pangenome): + """Tests that adding spot already in pangenome return a KeyError. + + :param pangenome: Access the pangenome object + """ + spot = Spot(spot_id=0) + pangenome.add_spot(spot) + with pytest.raises(KeyError): + pangenome.add_spot(spot) + + def test_add_spot_with_isinstance_not_spot(self, pangenome): + """Tests that adding an object with not Spot type return an AssertionError. + + :param pangenome: Access the pangenome object + """ + with pytest.raises(AssertionError): + pangenome.add_spot(4564) + + def test_get_spot_with_int(self, pangenome): + """Tests get_spot method with integer in pangenome class + + :param pangenome: Access the pangenome object + """ + spot = Spot(spot_id=0) + pangenome.add_spot(spot) + assert pangenome.get_spot(0) == spot + + def test_get_spot_with_str(self, pangenome): + """Tests get_spot method with string in pangenome class + + :param pangenome: Access the pangenome object + """ + spot = Spot(spot_id=0) + pangenome.add_spot(spot) + assert pangenome.get_spot("spot_0") == spot + + def test_get_spot_not_in_pangenome(self, pangenome): + """Tests that getting spot not in pangenome return a KeyError. 
+ + :param pangenome: Access the pangenome object + """ + with pytest.raises(KeyError): + pangenome.get_spot(544654) + + def test_number_of_spots(self, pangenome): + """Tests number_of_spots methods in Pangenome class + + :param pangenome: Access the pangenome object + """ + spot = Spot(spot_id=0) + pangenome.add_spot(spot) + assert isinstance(pangenome.number_of_spots, int) + assert pangenome.number_of_spots == 1 + + +class TestPangenomeModule(TestPangenome): + """This class tests methods in pangenome class associated to Modules. + """ + + def test_add_module(self, pangenome): + """Tests the add_module method in the Pangenome class. + + :param pangenome: Access the pangenome object + """ + module = Module(module_id=0) + pangenome.add_module(module) + assert len(pangenome._moduleGetter) == 1 + assert pangenome._moduleGetter[0] == module + + def test_add_module_already_in_pangenome(self, pangenome): + """Tests that adding module already in pangenome return a KeyError. + + :param pangenome: Access the pangenome object + """ + module = Module(module_id=0) + pangenome.add_module(module) + with pytest.raises(KeyError): + pangenome.add_module(module) + + def test_add_module_with_isinstance_not_region(self, pangenome): + """Tests that adding an object with not Module type return an AssertionError. 
+ + :param pangenome: Access the pangenome object + """ + with pytest.raises(AssertionError): + pangenome.add_module("module") + + def test_get_module_with_int(self, pangenome): + """Tests get_module method with integer in pangenome class + + :param pangenome: Access the pangenome object + """ + module = Module(module_id=0) + pangenome.add_module(module) + assert pangenome.get_module(0) == module + + def test_get_module_with_str(self, pangenome): + """Tests get_module method with string in pangenome class + + :param pangenome: Access the pangenome object + """ + module = Module(module_id=0) + pangenome.add_module(module) + assert pangenome.get_module("module_0") == module + + def test_get_module_not_in_pangenome(self, pangenome): + """Tests that getting module not in pangenome return a KeyError. + + :param pangenome: Access the pangenome object + """ + with pytest.raises(KeyError): + pangenome.get_module(0) + + def test_number_of_modules(self, pangenome): + """Tests number_of_modules methods in Pangenome class + + :param pangenome: Access the pangenome object + """ + module = Module(module_id=0) + pangenome.add_module(module) + assert isinstance(pangenome.number_of_modules, int) + assert pangenome.number_of_modules == 1 + + +class TestPangenomeMetadata(TestPangenome): + """This class tests methods in pangenome class associated to Metadata. 
+ """ + + @pytest.fixture + def add_element_to_pangenome(self, pangenome): + """Adds a metadata element to each element of pangenome + + :param pangenome: Access the pangenome object + """ + metadata = Metadata(source="source", attribute="attr") + family = GeneFamily(family_id=pangenome.max_fam_id, name="Fam") + family.add_metadata(source=metadata.source, metadata=metadata) + pangenome.add_gene_family(family) + org = Organism("Org") + org.add_metadata(source=metadata.source, metadata=metadata) + ctg = Contig("Ctg") + org.add(ctg) + gene = Gene("Gene") + gene.position, gene.start = (0, 0) + gene.add_metadata(source=metadata.source, metadata=metadata) + ctg[gene.start] = gene + pangenome.add_organism(org) + rgp = Region("RGP") + rgp.add_metadata(source=metadata.source, metadata=metadata) + pangenome.add_region(rgp) + spot = Spot(0) + spot.add_metadata(source=metadata.source, metadata=metadata) + pangenome.add_spot(spot) + module = Module(0) + module.add_metadata(source=metadata.source, metadata=metadata) + pangenome.add_module(module) + + def test_select_elem(self, add_element_to_pangenome, pangenome): + """Tests the select_elem method of the Pangenome class. 
+ + :param add_element_to_pangenome: Add elements to the pangenome + :param pangenome: Access the pangenome object + """ + assert all(isinstance(elem, GeneFamily) for elem in set(pangenome.select_elem("families"))) + assert all(isinstance(elem, Organism) for elem in set(pangenome.select_elem("genomes"))) + assert all(isinstance(elem, Gene) for elem in set(pangenome.select_elem("genes"))) + assert all(isinstance(elem, Region) for elem in set(pangenome.select_elem("RGPs"))) + assert all(isinstance(elem, Spot) for elem in set(pangenome.select_elem("spots"))) + assert all(isinstance(elem, Module) for elem in set(pangenome.select_elem("modules"))) + with pytest.raises(KeyError): + pangenome.select_elem("error") + + def test_metadata_sources(self, add_element_to_pangenome, pangenome): + """Tests the metadata_sources method of the Pangenome class. + + :param add_element_to_pangenome: Add elements to the pangenome + :param pangenome: Access the pangenome object + """ + for metatype in ["families", "genomes", "genes", "RGPs", "spots", "modules"]: + assert isinstance(pangenome.metadata_sources(metatype), set) + assert pangenome.metadata_sources(metatype) == {'source'} + + def test_metadata(self, add_element_to_pangenome, pangenome): + """Tests the metadata generator of the Pangenome class. + + :param add_element_to_pangenome: Add elements to the pangenome + :param pangenome: Access the pangenome object + """ + for metatype in ["families", "genomes", "genes", "RGPs", "spots", "modules"]: + for metadata_gen in pangenome.metadata(metatype): + for metadata in metadata_gen: + assert isinstance(metadata, Metadata) + assert metadata.source == 'source' + + def test_get_elem_by_metadata(self, add_element_to_pangenome, pangenome): + """Tests the metadata generator filtered by metadata attribute of the Pangenome class. 
+ + :param add_element_to_pangenome: Add elements to the pangenome + :param pangenome: Access the pangenome object + """ + for metatype, expected_type in {"families": GeneFamily, "genomes": Organism, "genes": Gene, "RGPs": Region, + "spots": Spot, "modules": Module}.items(): + for elem in pangenome.get_elem_by_metadata(metatype, attribute="attr"): + assert isinstance(elem, expected_type) + for metadata in elem.metadata: + assert isinstance(metadata, Metadata) + assert metadata.source == 'source' + + def test_get_elem_by_sources(self, add_element_to_pangenome, pangenome): + """Tests the metadata generator filtered by source of the Pangenome class. + + :param add_element_to_pangenome: Add elements to the pangenome + :param pangenome: Access the pangenome object + """ + for metatype, expected_type in {"families": GeneFamily, "genomes": Organism, "genes": Gene, "RGPs": Region, + "spots": Spot, "modules": Module}.items(): + for elem in pangenome.get_elem_by_sources(source='source', metatype=metatype): + assert isinstance(elem, expected_type) + for metadata in elem.metadata: + assert isinstance(metadata, Metadata) + assert metadata.source == 'source' diff --git a/tests/test_region.py b/tests/test_region.py new file mode 100644 index 00000000..16a4d68f --- /dev/null +++ b/tests/test_region.py @@ -0,0 +1,787 @@ +#! 
/usr/bin/env python3 +# coding: utf8 + +import pytest +from typing import Generator, Set +from random import randint + +from ppanggolin.region import Region, Spot, Module, GeneContext +from ppanggolin.geneFamily import GeneFamily +from ppanggolin.genome import Gene, Contig, Organism + + +@pytest.fixture +def genes() -> Generator[Set[Gene], None, None]: + """Create a set of genes to fill gene families + """ + genes = set() + for i in range(0, randint(11, 20)): + gene = Gene(f"gene_{str(i)}") + gene.fill_annotations(start=10 * i + 1, stop=10 * (i + 1), strand='+', position=i, genetic_code=4) + genes.add(gene) + yield genes + + +@pytest.fixture +def families(genes) -> Generator[Set[GeneFamily], None, None]: + """Create a set of gene families fill with genes to test edges + """ + families = set() + genes = list(genes) + nb_families = randint(2, 10) + nb_genes_per_family = len(genes) // nb_families + idx_fam = 1 + while idx_fam < nb_families: + family = GeneFamily(idx_fam, f"family_{idx_fam}") + idx_genes = 0 + while idx_genes < nb_genes_per_family: + gene = genes[(idx_fam - 1) * nb_genes_per_family + idx_genes] + family.add(gene) + gene.family = family + idx_genes += 1 + families.add(family) + idx_fam += 1 + # last family fill with all the gene left + family = GeneFamily(idx_fam, f"family_{idx_fam}") + idx_genes = (idx_fam - 1) * nb_genes_per_family + while idx_genes < len(genes): + gene = genes[idx_genes] + family.add(gene) + gene.family = family + idx_genes += 1 + families.add(family) + yield families + + +@pytest.fixture +def organisms(genes) -> Generator[Set[Organism], None, None]: + """Create a set of organism object for test + + :return: Generator with a set of organism object + """ + orgs = set() + genes = list(genes) + nb_organisms = randint(2, 10) + nb_genes_per_organism = len(genes) // nb_organisms + idx_org = 1 + while idx_org < nb_organisms: + org = Organism(f"organism_{idx_org}") + idx_genes = 0 + while idx_genes < nb_genes_per_organism: + gene = 
genes[(idx_org - 1) * nb_genes_per_organism + idx_genes] + gene.fill_parents(organism=org) + idx_genes += 1 + orgs.add(org) + idx_org += 1 + # The last organism fill with all the gene left + org = Organism(f"organism_{idx_org}") + idx_genes = (idx_org - 1) * nb_genes_per_organism + while idx_genes < len(genes): + gene = genes[idx_genes] + gene.fill_parents(organism=org) + idx_genes += 1 + orgs.add(org) + yield orgs + + +class TestRegion: + """Tests for region class + """ + attr_val = {'score': 0, 'starter': None, 'stopper': None} + + @pytest.fixture + def region(self) -> Generator[Region, None, None]: + """Generate a region object to test class + """ + yield Region("RGP") + + def test_cstr(self, region: Region): + """Tests that region is constructed as expected + """ + assert isinstance(region, Region) + assert region.name == "RGP" + assert isinstance(region._genes_getter, dict) + for attr, value in self.attr_val.items(): + assert region.__getattribute__(attr) == value + + def test_add_gene(self, region): + """Tests that genes can be aadded to a region + """ + gene = Gene('gene') + gene.fill_annotations(start=0, stop=10, strand='+', position=0) + region.add(gene) + assert len(region._genes_getter) == 1 + assert region._genes_getter[0] == gene + assert region.starter == gene + assert region.stopper == gene + assert gene.RGP == region + + def test_add_gene_not_is_instance_gene(self, region): + """Test that adding object with instance not Gene return a TypeError + """ + with pytest.raises(TypeError): + region.add(0) + + def test_add_gene_not_fill_with_position(self, region): + """Test that adding gene not fill with position return an AttributeError + """ + with pytest.raises(AttributeError): + region.add(Gene('gene')) + + def test_add_genes_at_position_already_taken(self, region): + """Test that adding genes with same position return a ValueError + """ + gene = Gene('gene') + gene.fill_annotations(start=0, stop=10, strand='+', position=0) + region.add(gene) + with 
pytest.raises(KeyError): + gene = Gene('gene') + gene.fill_annotations(start=4, stop=12, strand='-', position=0) + region.add(gene) + + def test_add_genes_from_different_contigs(self, region): + """Test that adding genes from different contigs return an Exception + """ + gene1, gene2 = Gene('gene_1'), Gene('gene_2') + gene1.fill_annotations(start=0, stop=10, strand='+', position=0) + gene2.fill_annotations(start=11, stop=20, strand='+', position=1) + gene1.fill_parents(None, Contig('contig_1')) + region.add(gene1) + gene2.fill_parents(None, Contig('contig_2')) + with pytest.raises(Exception): + region.add(gene2) + + def test_add_genes_from_different_organisms(self, region): + """Test that adding genes from different organisms return an Exception + """ + gene1, gene2 = Gene('gene_1'), Gene('gene_2') + gene1.fill_annotations(start=0, stop=10, strand='+', position=0) + gene2.fill_annotations(start=11, stop=20, strand='+', position=1) + gene1.fill_parents(Organism("org_1")) + region.add(gene1) + gene2.fill_parents(Organism("org_2")) + with pytest.raises(Exception): + region.add(gene2) + + def test_get_genes(self, region): + """Tests that genes can be retrieved from the region + """ + gene = Gene('gene') + gene.fill_annotations(start=0, stop=10, strand='+', position=0) + region.add(gene) + assert region.get(0) == gene + + def test_get_genes_with_position_not_integer(self, region): + """Tests that getting a gene with wrong type for position raise a TypeError + """ + with pytest.raises(TypeError): + region.get("0") + + def test_get_genes_with_position_not_in_region(self, region): + """Tests that getting a gene at position not belonging in the region return a KeyError + """ + with pytest.raises(KeyError): + region.get(randint(0, 20)) + + def test_del_gene(self, region): + """Tests that genes can be deleted from the region + """ + gene = Gene('gene') + gene.fill_annotations(start=0, stop=10, strand='+', position=0) + region.add(gene) + assert region.get(0) == gene + 
region.remove(0) + assert 0 not in region._genes_getter + + def test_del_genes_with_position_not_integer(self, region): + """Tests that removing a gene with wrong type for position raise a TypeError + """ + with pytest.raises(TypeError): + region.remove("0") + + def test_get_length(self, region): + """Tests that the length of the region can be retrieved + """ + gene1, gene2 = Gene('gene_1'), Gene('gene_2') + gene1.fill_annotations(start=0, stop=10, strand='+', position=0) + gene2.fill_annotations(start=11, stop=20, strand='+', position=1) + region.add(gene1) + region.add(gene2) + assert region.length == 20 + + def test_get_organism(self, region): + """Tests that the organism linked to the region can be retrieved + """ + gene = Gene('gene') + gene.fill_annotations(start=0, stop=10, strand='+', position=0) + gene.fill_parents(Organism("org")) + region.add(gene) + assert region.organism.name == 'org' + + def test_get_contig(self, region): + """Tests that the contig linked to the region can be retrieved + """ + gene = Gene('gene') + gene.fill_annotations(start=0, stop=10, strand='+', position=0) + gene.fill_parents(contig=Contig("contig")) + region.add(gene) + assert region.contig.name == 'contig' + + def test_is_whole_contig_true(self, region): + """Tests that the property is_whole_contig return True if the region has the same length as contig + """ + starter, stopper = Gene('starter'), Gene('stopper') + starter.fill_annotations(start=0, stop=10, strand='+', position=0) + stopper.fill_annotations(start=11, stop=20, strand='+', position=1) + contig = Contig("contig") + contig[starter.start], contig[stopper.start] = starter, stopper + starter.fill_parents(None, contig), stopper.fill_parents(None, contig) + region.add(starter), region.add(stopper) + assert region.is_whole_contig is True + + def test_is_whole_contig_false(self, region): + """Tests that the property is_whole_contig return False if the region has not the same length as contig + """ + before, starter, 
stopper, after = Gene('before'), Gene('starter'), Gene('stopper'), Gene('after') + before.fill_annotations(start=0, stop=10, strand='+', position=0) + starter.fill_annotations(start=11, stop=20, strand='+', position=1) + stopper.fill_annotations(start=21, stop=30, strand='+', position=2) + after.fill_annotations(start=31, stop=40, strand='+', position=3) + contig = Contig("contig") + contig[before.start], contig[after.start] = before, after + contig[starter.start], contig[stopper.start] = starter, stopper + before.fill_parents(None, contig), after.fill_parents(None, contig) + starter.fill_parents(None, contig), stopper.fill_parents(None, contig) + region.add(starter), region.add(stopper) + assert region.is_whole_contig is False + + def test_is_contig_border_true(self, region): + """Test that property is_contig_border return true if the region is bordering the contig + """ + before, starter, stopper, after = Gene('before'), Gene('starter'), Gene('stopper'), Gene('after') + before.fill_annotations(start=0, stop=10, strand='+', position=0) + starter.fill_annotations(start=11, stop=20, strand='+', position=1) + stopper.fill_annotations(start=21, stop=30, strand='+', position=2) + after.fill_annotations(start=31, stop=40, strand='+', position=3) + contig = Contig("contig") + before.fill_parents(None, contig), after.fill_parents(None, contig) + starter.fill_parents(None, contig), stopper.fill_parents(None, contig) + # Test bordering right + contig[before.start], contig[starter.start], contig[stopper.start] = before, starter, stopper + region.add(starter), region.add(stopper) + assert region.is_contig_border is True + # Test bordering left + del contig._genes_position[before.position] + del contig._genes_getter[before.start] + contig[after.start] = after + assert region.is_contig_border is True + + def test_is_contig_border_false(self, region): + """Tests that the property is_contig_border return False if the region is not bordering the contig + """ + before, starter, 
stopper, after = Gene('before'), Gene('starter'), Gene('stopper'), Gene('after') + before.fill_annotations(start=0, stop=10, strand='+', position=0) + starter.fill_annotations(start=11, stop=20, strand='+', position=1) + stopper.fill_annotations(start=21, stop=30, strand='+', position=2) + after.fill_annotations(start=31, stop=40, strand='+', position=3) + contig = Contig("contig") + contig[before.start], contig[after.start] = before, after + contig[starter.start], contig[stopper.start] = starter, stopper + before.fill_parents(None, contig), after.fill_parents(None, contig) + starter.fill_parents(None, contig), stopper.fill_parents(None, contig) + region.add(starter), region.add(stopper) + assert region.is_contig_border is False + + def test_is_contig_border_assertion_error_if_no_gene(self, region): + """Tests that an AssertionError is returned if there is no gene in the region + """ + with pytest.raises(AssertionError): + _ = region.is_contig_border + + def test_len(self, region, genes): + """Tests that the expected number of genes is retrieved in the region + """ + for gene in genes: + region.add(gene) + assert isinstance(len(region), int) + assert len(region) == len(genes) + + def test_equality(self, genes): + """Test equality between two regions + """ + region_1, region_2, region_3 = Region("RGP_1"), Region("RGP_2"), Region("RGP_3") + max_pos = max(genes, key=lambda gene: gene.position).position + for gene in genes: + region_1.add(gene) + region_2.add(gene) + region_3[max_pos - gene.position + 1] = gene + assert region_1 == region_2 + assert region_1 == region_3 + + def test_not_equal(self, region, genes): + """Test difference between two regions + """ + for gene in genes: + region.add(gene) + assert region != Region("other_RGP") + + def test_equality_with_not_instance_region(self, region): + """Test comparison between a region and another object raise a TypeError + """ + with pytest.raises(TypeError): + assert region == 4 + + def test_get_gene_families(self, 
region, genes, families): + """Tests that gene families can be retrieved from the region + """ + for gene in genes: + region.add(gene) + assert all(isinstance(family, GeneFamily) for family in region.families) + assert set(region.families) == families + + def test_get_number_of_gene_families(self, region, genes, families): + """Tests that gene families can be retrieved from the region + """ + for gene in genes: + region.add(gene) + assert isinstance(region.number_of_families, int) + assert region.number_of_families == len(families) + + # def test_get_bordering_genes(self, region, genes): + # # TODO test multigenic + # contig = Contig("contig") + # for gene in genes: + # contig[gene.start] = gene + # gene.fill_parents(None, contig) + # region[gene.position] = gene + # min_gene, max_gene = min(genes, key=lambda gene: gene.position), max(genes, key=lambda gene: gene.position) + # assert region.get_bordering_genes(1, {}) == [[min_gene], [max_gene]] + + +class TestSpot: + @pytest.fixture + def spot(self) -> Generator[Spot, None, None]: + """Generate a spot for test + """ + yield Spot(0) + + def test_cstr(self, spot): + """Tests that spot is constructed as expected + """ + assert spot.ID == 0 + assert isinstance(spot._region_getter, dict) and len(spot._region_getter) == 0 + assert isinstance(spot._uniqOrderedSet, dict) and len(spot._uniqOrderedSet) == 0 + assert isinstance(spot._uniqContent, dict) and len(spot._uniqContent) == 0 + + def test_cstr_type_error(self): + """Tests that TypeError is returned if identifier is not an integer + """ + with pytest.raises(TypeError): + Spot("spot_0") + + def test_repr(self, spot): + """Test that the canonical string representing a spot does not change + """ + assert repr(spot) == "Spot 0 - #RGP: 0" + + def test_str(self, spot): + """Test that the writing spot method does not change + """ + assert str(spot) == "spot_0" + + @pytest.fixture + def region(self) -> Generator[Region, None, None]: + """Create a region for test + """ + yield 
Region("RGP_0") + + def test_add_region(self, spot, region): + """Tests that adding a Region object to the Spot object works as expected + """ + spot.add(region) + assert region == spot._region_getter[region.name] + + def test_add_not_instance_region(self, spot): + """Tests that a TypeError is returned if a non-region type is trying to be added + """ + with pytest.raises(TypeError): + spot.add("region") + + def test_add_different_region_with_same_name(self, spot): + """Test that adding a new Region same name than another in the spot return a KeyError + """ + region_1, region_2 = Region("RGP"), Region("RGP") + gene_1, gene_2 = Gene("gene_1"), Gene("gene_2") + gene_1.fill_annotations(start=0, stop=10, strand='+', position=0) + gene_2.fill_annotations(start=0, stop=10, strand='+', position=0) + gene_1.family, gene_2.family = GeneFamily(0, "Fam_0"), GeneFamily(1, "Fam_1") + region_1[0], region_2[0] = gene_1, gene_2 + spot[region_1.name] = region_1 + with pytest.raises(KeyError): + spot[region_2.name] = region_2 + + def test_add_two_time_the_same_region(self, spot, region): + """Test that adding a two time the same region is working as expected + """ + gene = Gene("gene") + gene.fill_annotations(start=0, stop=10, strand='+', position=0) + gene.family = GeneFamily(0, "Fam") + region[0] = gene + spot[region.name] = region + assert region in spot._region_getter.values() + spot[region.name] = region + assert region in spot._region_getter.values() + + def test_get_region(self, spot, region): + """Tests that getting the region in the Spot object works as expected + """ + spot.add(region) + assert spot.get(region.name) == region + + def test_get_region_not_in_spot(self, spot): + """Tests that a KeyError is raised when the name of the region does not exist in the spot + """ + with pytest.raises(KeyError): + _ = spot["rgp"] + + def test_delete_region_in_spot(self, spot, region): + """Tests that remove a region from the spot work as expected + """ + spot[region.name] = region + 
del spot[region.name] + assert region.name not in spot._region_getter + + def test_len(self, spot, region): + """Tests that getting the number of regions work as expected + """ + assert isinstance(len(spot), int) + assert len(spot) == 0 + spot[region.name] = region + assert len(spot) == 1 + + @pytest.fixture + def regions(self, genes): + """Create a random number of regions fill with genes + """ + regions = set() + genes = sorted(list(genes), key=lambda x: x.position) + nb_regions = randint(2, len(genes)) + nb_genes_per_region = len(genes) // nb_regions + idx_region = 1 + while idx_region < nb_regions: + region = Region(f"RGP_{idx_region}") + genes_counter = 0 + while genes_counter < nb_genes_per_region: + gene = genes.pop(0) + region[gene.position] = gene + gene.RGP = region + genes_counter += 1 + regions.add(region) + idx_region += 1 + # last region fill with all the gene left + region = Region(f"RGP_{idx_region}") + while len(genes) > 0: + gene = genes.pop(0) + region[gene.position] = gene + gene.RGP = region + regions.add(region) + yield regions + + def test_get_all_regions(self, spot, regions): + """Tests that getting all the region in the spot works as expected + """ + for region in regions: + spot[region.name] = region + assert len(spot) == len(regions) + assert all(type(region) == Region for region in spot.regions) + assert regions == set(spot.regions) + + def test_get_families(self, spot, regions, families): + """Tests that getting the gene families in the Spot object works as expected + """ + for region in regions: + spot[region.name] = region + assert set(spot.families) == families + + def test_number_of_families(self, spot, regions, families): + """Tests that getting the number of families in the spot works as expected + """ + for region in regions: + spot[region.name] = region + assert isinstance(spot.number_of_families, int) + assert spot.number_of_families == len(families) + + def test_add_spot_to_families(self, spot, regions, families): + """Tests 
that adding spot to families works as expected + """ + for region in regions: + spot[region.name] = region + spot.spot_2_families() + assert all(set(family.spots) == {spot} for family in spot.families) + + @pytest.fixture + def srgps(self, regions): + """Create a random number of same rgp for all regions + """ + srgps = set() + for region in regions: + nb_sim_rgp = randint(1, 3) + for idx_sim_rgp in range(1, nb_sim_rgp + 1): + sim_rgp = Region(f"s{region.name}.{idx_sim_rgp}") + for gene in region.genes: + sim_rgp[gene.position] = gene + srgps.add(sim_rgp) + yield srgps + + def test_get_uniq_rgp_set(self, spot, regions, families, srgps): + """Tests that getting identical rgp in the Spot object works as expected + """ + for region in list(regions) + list(srgps): # With lists provide sRGP to be key RGP in dict + spot[region.name] = region + assert len(spot) == len(regions) + len(srgps) + uniq2rgp = spot.get_uniq_to_rgp() + for region, sim_rgps in uniq2rgp.items(): + assert region in regions + assert set(region.families) == set.union(*[set(srgp.families) for srgp in sim_rgps]) + + def test_get_uniq_ordered_set(self, spot, regions, families, srgps): + """Tests that getting the unique synteny in the Spot object works as expected + """ + for region in list(regions) + list(srgps): # With lists provide sRGP to be key RGP in dict + spot[region.name] = region + assert len(spot) == len(regions) + len(srgps) + assert spot.get_uniq_ordered_set().issubset(regions) + + def test_get_uniq_content(self, spot, regions, families, srgps): + """Tests that getting the unique RGP in the Spot object works as expected + """ + for region in list(regions) + list(srgps): # With lists provide sRGP to be key RGP in dict + spot[region.name] = region + assert len(spot) == len(regions) + len(srgps) + assert spot.get_uniq_ordered_set().issubset(regions) + + +class TestModule: + @pytest.fixture + def module(self): + """Create a basic module + """ + yield Module(0) + + def test_cstr(self, module): + 
"""Test that a module is construct as expected + """ + assert module.ID == 0 + assert isinstance(module._families_getter, dict) and module._families_getter == {} + + def test_cstr_type_error(self): + """Test that if the identifier is not an integer it raises a TypeError + """ + with pytest.raises(TypeError): + Spot("mod_0") + + def test_repr(self, module): + """Test that the canonical string representing a module does not change + """ + assert repr(module) == "Module 0 - #Families: 0" + + def test_str(self, module): + """Test that the writing spot method does not change + """ + assert str(module) == "module_0" + + def test_hash(self, module): + """Test that len method work as expected + """ + assert isinstance(hash(module), int) + + def test_len(self, module): + """Test that len method work as expected + """ + module._families_getter["fam"] = GeneFamily(randint(1,5), "fam") + assert isinstance(len(module), int) + assert len(module) == 1 + + def test_eq(self, families): + """Test equality between modules + """ + module1, module2, module3 = Module(1), Module(2), Module(3) + for family in families: + module1[family.name] = family + module2[family.name] = family + assert module1 == module2 + assert module1 != module3 + + def test_eq_with_is_not_instance_module(self, module): + """Test comparison between a module and another object raise a TypeError + """ + with pytest.raises(TypeError): + assert module == 4 + + @pytest.fixture + def family(self) -> Generator[GeneFamily, None, None]: + """Create a basic gene family for test + """ + yield GeneFamily(0, 'family') + + def test_add_family(self, module, family): + """Tests that a gene family can be added to the module + """ + module[family.name] = family + assert len(module._families_getter) == 1 + assert module._families_getter['family'] == family + + def test_add_different_families_with_same_name(self, module): + """Test that adding a new family with the same name as another in the module return a KeyError + """ + 
family_1, family_2 = GeneFamily(1, 'family_1'), GeneFamily(1, 'family_1') + module[family_1.name] = family_1 + with pytest.raises(KeyError): + module[family_2.name] = family_2 + + def test_add_two_time_the_same_family(self, module, family): + """Test that adding a two time the same family is working as expected + """ + module[family.name] = family + assert family in module._families_getter.values() + module[family.name] = family + assert family in module._families_getter.values() + + def test_get_family(self, module, family): + """Tests that a gene family can be retrieved from the module + """ + module[family.name] = family + assert module['family'] == family + + def test_get_family_which_does_not_exist(self, module): + """Tests that if a gene family does not exist it raises a KeyError""" + fam = GeneFamily(randint(1, 20), f"fam{randint(1, 20)}") + with pytest.raises(KeyError): + _ = module[fam.name] + + def test_delete_family(self, module, family): + """Tests that a gene family can be deleted from the module + """ + module[family.name] = family + del module['family'] + assert len(module) == 0 + + def test_delete_family_which_does_not_exist(self, module): + """Tests that if a gene family does not exist it raises a KeyError + """ + fam = GeneFamily(randint(1, 20), f"fam{randint(1, 20)}") + with pytest.raises(KeyError): + del module[fam.name] + + +class TestGeneContext: + @pytest.fixture + def context(self): + """Generate a basic context + """ + yield GeneContext(0) + + def test_cstr(self, context): + """Test that a gene context is construct as expected + """ + assert context.ID == 0 + assert isinstance(context._families_getter, dict) and context._families_getter == {} + + def test_cstr_type_error(self): + """Test that if the identifier is not an integer it raises a TypeError + """ + with pytest.raises(TypeError): + Spot("gc_0") + + def test_repr(self, context): + """Test that the canonical string representing a context does not change + """ + assert repr(context) == 
"Context 0 - #Families: 0" + + def test_str(self, context): + """Test that the writing spot method does not change + """ + assert str(context) == "GC_0" + + def test_hash(self, context): + """Test that len method work as expected + """ + assert isinstance(hash(context), int) + + def test_len(self, context): + """Test that len method work as expected + """ + context._families_getter["fam"] = GeneFamily(randint(1, 5), "fam") + assert isinstance(len(context), int) + assert len(context) == 1 + + def test_eq(self, families): + """Test equality between two contexts + """ + context1, context2, context3 = GeneContext(1), GeneContext(2), GeneContext(3) + for family in families: + context1[family.name] = family + context2[family.name] = family + assert context1 == context2 + assert context1 != context3 + + def test_eq_with_is_not_instance_context(self, context): + """Test comparison between a context and another object raise a TypeError + """ + with pytest.raises(TypeError): + assert context == 4 + + @pytest.fixture + def family(self) -> Generator[GeneFamily, None, None]: + """Create a basic gene family for test + """ + yield GeneFamily(0, 'family') + + def test_add_family(self, context, family): + """Tests that a gene family can be added to the context + """ + context[family.name] = family + assert len(context._families_getter) == 1 + assert context._families_getter['family'] == family + + def test_add_different_families_with_same_name(self, context): + """Test that adding a new family with the same name as another in the context return a KeyError + """ + family_1, family_2 = GeneFamily(1, 'family_1'), GeneFamily(1, 'family_1') + context[family_1.name] = family_1 + with pytest.raises(KeyError): + context[family_2.name] = family_2 + + def test_add_two_time_the_same_family(self, context, family): + """Test that adding a two time the same family is working as expected + """ + context[family.name] = family + assert family in context._families_getter.values() + 
context[family.name] = family + assert family in context._families_getter.values() + + def test_get_family(self, context, family): + """Tests that a gene family can be retrieved from the context + """ + context[family.name] = family + assert context['family'] == family + + def test_get_family_which_does_not_exist(self, context): + """Tests that if a gene family does not exist it raises a KeyError""" + fam = GeneFamily(randint(1, 20), f"fam{randint(1, 20)}") + with pytest.raises(KeyError): + _ = context[fam.name] + + def test_delete_family(self, context, family): + """Tests that a gene family can be deleted from the context + """ + context[family.name] = family + del context['family'] + assert len(context) == 0 + + def test_delete_family_which_does_not_exist(self, context): + """Tests that if a gene family does not exist it raises a KeyError + """ + fam = GeneFamily(randint(1, 20), f"fam{randint(1, 20)}") + with pytest.raises(KeyError): + del context[fam.name] From d66cd3f91cedea338ccfa3ca06790d7472ec9441 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Thu, 7 Sep 2023 15:54:13 +0200 Subject: [PATCH 095/173] fix method number_of_contigs called as an attribute --- tests/test_genome.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_genome.py b/tests/test_genome.py index 4b511b98..c060e79d 100644 --- a/tests/test_genome.py +++ b/tests/test_genome.py @@ -523,7 +523,7 @@ def test_number_of_contigs(self, organism): """ organism.add(Contig('contig1')) organism.add(Contig('contig2')) - assert organism.number_of_contigs() == 2 + assert organism.number_of_contigs == 2 def test_get_families(self, organism, contig, gene): """Tests that gene families in an organism can be retrieved From 62219d30b6268d41a5d747339a8889323f55a824 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Thu, 7 Sep 2023 16:52:34 +0200 Subject: [PATCH 096/173] fix region writting fct --- ppanggolin/projection/projection.py | 6 +++--- 1 file changed, 3 insertions(+), 3 
deletions(-) diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index 869e9d3b..7ec39167 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -238,14 +238,14 @@ def summarize_projection(input_organism:Organism, pangenome:Pangenome, input_org "Genes": gene_count, "Families": families_count, "Persistent": {"genes":persistent_gene_count, "families":persistent_family_count}, - "Shell": {"genes":persistent_gene_count, "families":persistent_family_count}, - "Cloud": {"genes":persistent_gene_count, "families":persistent_family_count, "singleton families":singleton_gene_count}, + "Shell": {"genes":shell_gene_count, "families":shell_family_count}, + "Cloud": {"genes":cloud_gene_count, "families":cloud_family_count - singleton_gene_count, "specific families":singleton_gene_count}, "RGPs": rgp_count, "Spots": spot_count, "New spots": new_spot_count, "Modules": module_count } - yaml_string = yaml.dump(summary_info, default_flow_style=False, sort_keys=False) + yaml_string = yaml.dump(summary_info, default_flow_style=False, sort_keys=False, indent=4) # summary_str = '\n'.join((f' - {k}: {v}' for k,v in summary_info )) From 42007f985caadd7bf3f41abc71d7a446302504dd Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Thu, 7 Sep 2023 17:12:17 +0200 Subject: [PATCH 097/173] add timer to evaluate mmseqs search --- ppanggolin/align/alignOnPang.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/ppanggolin/align/alignOnPang.py b/ppanggolin/align/alignOnPang.py index 2382b461..0214b5e8 100644 --- a/ppanggolin/align/alignOnPang.py +++ b/ppanggolin/align/alignOnPang.py @@ -2,6 +2,7 @@ # coding:utf-8 # default libraries +import time from _io import TextIOWrapper import logging import tempfile @@ -106,9 +107,15 @@ def align_seq_to_pang(target_seq_file:Path , query_seq_file: Path, output: Path, cmd = ["mmseqs", "search", query_db.as_posix(), target_db.as_posix(), aln_db.name, tmpdir.as_posix(), "-a", 
"--min-seq-id", str(identity), "-c", str(coverage), "--cov-mode", cov_mode, "--threads", str(cpu), "--max-accept", str(1)] + + logging.getLogger().info("Aligning sequences") logging.getLogger().debug(" ".join(cmd)) + + start = time.time() subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) + align_time = time.time() - start + logging.getLogger().info(f"Done aligning sequences in {round(align_time,2)} seconds") with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir, prefix="aln_result_db_file", suffix = ".tsv", delete=False) as outfile: cmd = ["mmseqs", "convertalis", query_db.as_posix(), target_db.as_posix(), aln_db.name, outfile.name, "--format-mode", "2"] From 1b92f2647d202f412e7787a5945c1ddf7d6e01c3 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Thu, 7 Sep 2023 17:28:14 +0200 Subject: [PATCH 098/173] fix find module fct with refactoring changes --- ppanggolin/projection/projection.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index 7ec39167..b9a03e3a 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -367,7 +367,7 @@ def write_predicted_regions(regions: Set[Region], writer.writeheader() regions = sorted(regions, key=lambda x: ( - x.organism.name, x.contig.name, x.starter)) + x.organism.name, x.contig.name, x.ID)) for region in regions: row = { "region": region.name, @@ -403,7 +403,7 @@ def write_rgp_to_spot_table(rgp_to_spots: Dict[Region, Set[str]], output: Path, writer.writeheader() regions = sorted(rgp_to_spots.keys(), key=lambda x: ( - x.organism.name, x.contig.name, x.starter)) + x.organism.name, x.contig.name, x.ID)) for region in regions: row = { "region": region.name, @@ -686,13 +686,15 @@ def project_and_write_modules(pangenome: Pangenome, input_organism: Organism, """ Write a tsv file providing association between modules and the input organism + :param pangenome: Pangenome object + :param 
input_organism: the organism that is being annotated :param output: Path to output directory :param compress: Compress the file in .gz """ output_file = output / "modules_in_input_organism.tsv" - input_organism_families = input_organism.families + input_organism_families = list(input_organism.families) counter = 0 modules_in_input_org = [] with write_compressed_or_not(output_file, compress) as fout: From 0d6f70776534a17403d32822a403730840bb83a1 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Fri, 8 Sep 2023 11:31:30 +0200 Subject: [PATCH 099/173] deal with rare case when id is already a family name in pan but also a specific gene --- ppanggolin/projection/projection.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index b9a03e3a..e17383cd 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -309,7 +309,23 @@ def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_org gene_family = seqid_to_gene_family[gene_id] gene_family.add(gene) except KeyError: - new_gene_family = GeneFamily(pangenome.max_fam_id, gene_id) + # the seqid is not in the dict so it does not align with any pangenome families + # We consider it as cloud gene + try: + # in some case a family exists already and has the same name of the gene id + # So gene id cannot be used + _ = pangenome.get_gene_family(gene_id) + except KeyError: + new_gene_family = GeneFamily(pangenome.max_fam_id, gene_id) + + else: + # gene id already exists. + new_name=f"{input_organism.name}_{gene_id}" + logging.getLogger('PPanGGOLiN').warning('The input organism as a specific gene that does not align to any ' + f'pangenome families with the same id ({gene_id}) than an existing gene family in the pangenome. 
' + f'The organism name is added to the family name: {new_name}') + new_gene_family = GeneFamily(pangenome.max_fam_id, new_name) + pangenome.add_gene_family(new_gene_family) new_gene_family.add(gene) new_gene_family.partition = "Cloud" From e6fb0a8dd319b42b1e6d9054815d09b2a7d8c705 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 12 Sep 2023 17:09:42 +0200 Subject: [PATCH 100/173] add disable bar in align fct --- ppanggolin/align/alignOnPang.py | 7 ++++--- ppanggolin/context/searchGeneContext.py | 2 +- ppanggolin/projection/projection.py | 9 +++++---- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/ppanggolin/align/alignOnPang.py b/ppanggolin/align/alignOnPang.py index 0214b5e8..b468f94a 100644 --- a/ppanggolin/align/alignOnPang.py +++ b/ppanggolin/align/alignOnPang.py @@ -445,7 +445,7 @@ def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_file: Path, def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_file: Path, output: Path, tmpdir: Path, cpu: int = 1, no_defrag: bool = False, identity: float = 0.8, coverage: float = 0.8, - translation_table: int = 11,) -> Tuple[set, str, dict]: + translation_table: int = 11, disable_bar: bool = False,) -> Tuple[set, str, dict]: """ Assign gene families from a pangenome to input sequences. @@ -461,6 +461,7 @@ def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_file: Path, :param identity: Minimum identity threshold for the alignment (default: 0.8). :param coverage: Minimum coverage threshold for the alignment (default: 0.8). :param translation_table: Translation table to use if sequences need to be translated (default: 11). + :param disable_bar: Disable progress bar :return: A tuple containing the set of input sequences, the path to the alignment result file, and a dictionary mapping input sequences to gene families. 
@@ -470,7 +471,7 @@ def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_file: Path, with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir.as_posix(), delete=False, prefix="all_pangenome_genes", suffix=".fna") as tmp_pang_file: - write_all_gene_sequences(pangenome, tmp_pang_file, add="ppanggolin_") + write_all_gene_sequences(pangenome, tmp_pang_file, add="ppanggolin_", disable_bar=disable_bar) with read_compressed_or_not(sequence_file) as seqFileObj: seq_set, is_nucleotide = get_seq_ids(seqFileObj) @@ -539,7 +540,7 @@ def align(pangenome: Pangenome, sequence_file: Path, output: Path, identity: flo seq_set, align_file, seq2pang = get_input_seq_to_family_with_all(pangenome=pangenome, sequence_file=sequence_file, output=output, tmpdir=new_tmpdir, cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, - translation_table=translation_table) + translation_table=translation_table, disable_bar=disable_bar) if getinfo or draw_related: # TODO Add getinfo to function and remove if get_seq_info(seq2pang, pangenome, output, draw_related, disable_bar=disable_bar) diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index 339c271e..07edd0f0 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -69,7 +69,7 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: Path, tmpdir: output=output, tmpdir=new_tmpdir, cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, - translation_table=translation_table) + translation_table=translation_table, disable_bar=disable_bar) project_and_write_partition(seq2pan, seq_set, output) diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index e17383cd..2dbd372f 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -153,7 +153,8 @@ def launch(args: argparse.Namespace): output=output_dir, cpu=args.cpu,use_representatives=args.fast, 
no_defrag=args.no_defrag, identity=args.identity, coverage=args.coverage, tmpdir=args.tmpdir, - translation_table=args.translation_table, keep_tmp=args.keep_tmp) + translation_table=args.translation_table, keep_tmp=args.keep_tmp, + disable_bar=args.disable_prog_bar) input_org_rgps, input_org_spots, input_org_modules = None, None, None @@ -256,7 +257,7 @@ def summarize_projection(input_organism:Organism, pangenome:Pangenome, input_org def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_organism: Organism, output: Path, cpu: int,use_representatives:bool, no_defrag: bool, identity: float, coverage: float, tmpdir: Path, - translation_table: int, keep_tmp:bool = False): + translation_table: int, keep_tmp:bool = False, disable_bar: bool =False): """ Annotate input genes with pangenome gene families and perform clustering. @@ -269,9 +270,9 @@ def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_org :param identity: Minimum identity threshold for gene clustering. :param coverage: Minimum coverage threshold for gene clustering. :param tmpdir: Temporary directory for intermediate files. - :param disable_bar: Whether to disable progress bar. :param translation_table: Translation table ID for nucleotide sequences. :param keep_tmp: If True, keep temporary files. + :param disable_bar: Whether to disable progress bar. :return: Number of genes that do not cluster with any of the gene families of the pangenome. 
""" @@ -296,7 +297,7 @@ def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_org seq_set, _, seqid_to_gene_family = get_input_seq_to_family_with_all(pangenome=pangenome, sequence_file=seq_fasta_file, output=output, tmpdir=new_tmpdir, cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, - translation_table=translation_table) + translation_table=translation_table, disable_bar=disable_bar) project_and_write_partition(seqid_to_gene_family, seq_set, output) From 5873e24a29459e9503f42e80f3ecef5d332c3d0c Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 12 Sep 2023 17:13:48 +0200 Subject: [PATCH 101/173] adjust mmseqs param to prevent missing short protein --- ppanggolin/align/alignOnPang.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/ppanggolin/align/alignOnPang.py b/ppanggolin/align/alignOnPang.py index b468f94a..ed285c3b 100644 --- a/ppanggolin/align/alignOnPang.py +++ b/ppanggolin/align/alignOnPang.py @@ -102,11 +102,12 @@ def align_seq_to_pang(target_seq_file:Path , query_seq_file: Path, output: Path, cov_mode = "0" # coverage of query and target # mmseqs search command + # see https://github.com/soedinglab/MMseqs2/issues/373 Using a combination of param to no miss short proteins with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir.as_posix(), prefix="aln_result_db_file", suffix=".aln.DB", delete=False) as aln_db: - cmd = ["mmseqs", "search", query_db.as_posix(), target_db.as_posix(), aln_db.name, tmpdir.as_posix(), "-a", "--min-seq-id", str(identity), - "-c", str(coverage), "--cov-mode", cov_mode, "--threads", str(cpu), "--max-accept", str(1)] - + cmd = ["mmseqs", "search", query_db.as_posix(), target_db.as_posix(), aln_db.name, tmpdir.as_posix(), "-a", "--min-seq-id", str(identity), + "-c", str(coverage), "--cov-mode", cov_mode, "--threads", str(cpu), + "--seed-sub-mat", "VTML40.out", "-s", "2", '--comp-bias-corr', "0", "--mask", "0", "-e", "1"] logging.getLogger().info("Aligning sequences") 
From dff45f72c99065a8fe880b5dc5856a0612a3e024 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 13 Sep 2023 10:22:19 +0200 Subject: [PATCH 102/173] reorganise part of alignment fct --- ppanggolin/align/alignOnPang.py | 64 ++++++++++++++----------- ppanggolin/context/searchGeneContext.py | 17 ++++--- ppanggolin/projection/projection.py | 14 +++--- 3 files changed, 52 insertions(+), 43 deletions(-) diff --git a/ppanggolin/align/alignOnPang.py b/ppanggolin/align/alignOnPang.py index ed285c3b..eb72c40e 100644 --- a/ppanggolin/align/alignOnPang.py +++ b/ppanggolin/align/alignOnPang.py @@ -12,6 +12,9 @@ from typing import List, Tuple, Set, Dict, IO, Iterator from pathlib import Path + +from tqdm import tqdm + # local libraries from ppanggolin.formats import check_pangenome_info from ppanggolin.geneFamily import GeneFamily @@ -230,15 +233,16 @@ def get_seq_ids(seq_file: TextIOWrapper) -> Tuple[Set[str], bool]: -def write_gene_fam_sequences(pangenome: Pangenome, file_obj: IO, add: str = ""): +def write_gene_fam_sequences(pangenome: Pangenome, file_obj: IO, add: str = "", disable_bar:bool=False): """ Export the sequence of gene families :param pangenome: Pangenome containing families :param file_obj: Temporary file where sequences will be written :param add: Add prefix to sequence name + :param disable_bar: disable progress bar """ - for fam in pangenome.gene_families: + for fam in tqdm(pangenome.gene_families, unit="families", disable=disable_bar, total=pangenome.number_of_gene_families): file_obj.write(">" + add + fam.name + "\n") file_obj.write(fam.sequence + "\n") # file_obj.flush() @@ -250,6 +254,8 @@ def write_all_gene_sequences(pangenome: Pangenome, file_obj: IO, add: str = "", :param pangenome: Pangenome containing genes :param file_obj: Temporary file where sequences will be written :param add: Add prefix to sequence name + :param disable_bar: disable progress bar + """ if pangenome.status["geneSequences"] == "inFile": @@ -400,9 +406,9 @@ def 
get_seq_info(seq_to_pang: dict, pangenome: Pangenome, output: Path, draw_rel logging.getLogger("PPanGGOLiN").info(f"File listing RGP and spots where sequences of interest are located : " f"{output / 'info_input_seq.tsv'}") -def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_file: Path, output: Path, tmpdir: Path, +def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_file: Path, output: Path, tmpdir: Path, is_input_seq_nt:bool, cpu: int = 1, no_defrag: bool = False, identity: float = 0.8, - coverage: float = 0.8, translation_table: int = 11) -> Tuple[set, str, dict]: + coverage: float = 0.8, translation_table: int = 11, disable_bar:bool = False) -> Tuple[set, str, dict]: """ Assign gene families from a pangenome to input sequences. @@ -414,13 +420,15 @@ def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_file: Path, :param sequence_file: Path to a FASTA file containing input sequences to align. :param output: Path to the output directory where alignment results will be stored. :param tmpdir: Temporary directory for intermediate files. + :param is_input_seq_nt: Is input sequence file nucleotide sequences. :param cpu: Number of CPU cores to use for the alignment (default: 1). :param no_defrag: If True, the defragmentation workflow is skipped (default: False). :param identity: Minimum identity threshold for the alignment (default: 0.8). :param coverage: Minimum coverage threshold for the alignment (default: 0.8). :param translation_table: Translation table to use if sequences need to be translated (default: 11). + :param disable_bar: If True, disable the progress bar. - :return: A tuple containing the set of input sequences, the path to the alignment result file, + :return: A tuple containing the path to the alignment result file, and a dictionary mapping input sequences to gene families. 
""" @@ -428,25 +436,22 @@ def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_file: Path, with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir.as_posix(), delete=False, prefix="representative_genes", suffix=".faa") as tmp_pang_file: - write_gene_fam_sequences(pangenome, tmp_pang_file, add="ppanggolin_") - - with read_compressed_or_not(sequence_file) as seqFileObj: - seqids_set, is_nucleotide = get_seq_ids(seqFileObj) + logging.getLogger().debug(f'Write gene family sequences in {tmp_pang_file.name}') + write_gene_fam_sequences(pangenome, tmp_pang_file, add="ppanggolin_", disable_bar=disable_bar) align_file = align_seq_to_pang(target_seq_file=Path(tmp_pang_file.name), query_seq_file=sequence_file, output=output, tmpdir=tmpdir, cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, - is_query_nt=is_nucleotide, is_target_nt=False, + is_query_nt=is_input_seq_nt, is_target_nt=False, translation_table=translation_table) seq2pang, align_file = map_input_gene_to_family_rep_aln(align_file, output, pangenome) - return seqids_set, align_file, seq2pang + return align_file, seq2pang - -def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_file: Path, output: Path, tmpdir: Path, - cpu: int = 1, no_defrag: bool = False, identity: float = 0.8, coverage: float = 0.8, - translation_table: int = 11, disable_bar: bool = False,) -> Tuple[set, str, dict]: +def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_file: Path, output: Path, tmpdir: Path, is_input_seq_nt:bool, + cpu: int = 1, no_defrag: bool = False, identity: float = 0.8, coverage: float = 0.8, + translation_table: int = 11, disable_bar:bool = False) -> Tuple[set, str, dict]: """ Assign gene families from a pangenome to input sequences. @@ -457,36 +462,34 @@ def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_file: Path, :param sequence_file: Path to a FASTA file containing input sequences to align. 
:param output: Path to the output directory where alignment results will be stored. :param tmpdir: Temporary directory for intermediate files. + :param is_input_seq_nt: Is input sequence file nucleotide sequences. :param cpu: Number of CPU cores to use for the alignment (default: 1). :param no_defrag: If True, the defragmentation workflow is skipped (default: False). :param identity: Minimum identity threshold for the alignment (default: 0.8). :param coverage: Minimum coverage threshold for the alignment (default: 0.8). :param translation_table: Translation table to use if sequences need to be translated (default: 11). - :param disable_bar: Disable progress bar + :param disable_bar: If True, disable the progress bar. - :return: A tuple containing the set of input sequences, the path to the alignment result file, + :return: A tuple containing the path to the alignment result file, and a dictionary mapping input sequences to gene families. """ with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir.as_posix(), delete=False, prefix="all_pangenome_genes", suffix=".fna") as tmp_pang_file: - + + logging.getLogger().debug(f'Write all pangenome gene sequences in {tmp_pang_file.name}') write_all_gene_sequences(pangenome, tmp_pang_file, add="ppanggolin_", disable_bar=disable_bar) - with read_compressed_or_not(sequence_file) as seqFileObj: - seq_set, is_nucleotide = get_seq_ids(seqFileObj) - align_file = align_seq_to_pang(target_seq_file=Path(tmp_pang_file.name), query_seq_file=sequence_file, output=output, tmpdir=tmpdir, cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, - is_query_nt=is_nucleotide, is_target_nt=True, + is_query_nt=is_input_seq_nt, is_target_nt=True, translation_table=translation_table ) seq2pang, align_file = map_input_gene_to_family_all_aln(align_file, output, pangenome) - return seq_set, align_file, seq2pang - + return align_file, seq2pang def align(pangenome: Pangenome, sequence_file: Path, output: Path, identity: float = 0.8, coverage: 
float = 0.8, no_defrag: bool = False, cpu: int = 1, getinfo: bool = False, @@ -529,17 +532,20 @@ def align(pangenome: Pangenome, sequence_file: Path, output: Path, identity: flo need_spots=True, need_modules=need_mod, disable_bar=disable_bar) else: check_pangenome_info(pangenome, need_families=True, disable_bar=disable_bar) + + with read_compressed_or_not(sequence_file) as seqFileObj: + seq_set, is_nucleotide = get_seq_ids(seqFileObj) with create_tmpdir(main_dir=tmpdir, basename="align_input_seq_tmpdir", keep_tmp=keep_tmp) as new_tmpdir: if use_representatives: - seq_set, align_file, seq2pang = get_input_seq_to_family_with_rep(pangenome, sequence_file, output, new_tmpdir, - cpu, no_defrag, identity=identity, coverage=coverage, - translation_table=translation_table) + align_file, seq2pang = get_input_seq_to_family_with_rep(pangenome, sequence_file, output, new_tmpdir, is_input_seq_nt=is_nucleotide, + cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, + translation_table=translation_table, disable_bar=disable_bar) else: - seq_set, align_file, seq2pang = get_input_seq_to_family_with_all(pangenome=pangenome, sequence_file=sequence_file, - output=output, tmpdir=new_tmpdir, + align_file, seq2pang = get_input_seq_to_family_with_all(pangenome=pangenome, sequence_file=sequence_file, + output=output, tmpdir=new_tmpdir, is_input_seq_nt=is_nucleotide, cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, translation_table=translation_table, disable_bar=disable_bar) diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index 07edd0f0..d75cee64 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -17,10 +17,10 @@ # local libraries from ppanggolin.formats import check_pangenome_info from ppanggolin.genome import Gene, Contig -from ppanggolin.utils import mk_outdir, restricted_float, add_gene, connected_components, create_tmpdir +from ppanggolin.utils import 
mk_outdir, restricted_float, add_gene, connected_components, create_tmpdir, read_compressed_or_not from ppanggolin.geneFamily import GeneFamily from ppanggolin.pangenome import Pangenome -from ppanggolin.align.alignOnPang import project_and_write_partition, get_input_seq_to_family_with_rep, get_input_seq_to_family_with_all +from ppanggolin.align.alignOnPang import project_and_write_partition, get_input_seq_to_family_with_rep, get_input_seq_to_family_with_all, get_seq_ids from ppanggolin.region import GeneContext @@ -58,15 +58,18 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: Path, tmpdir: fam_2_seq = None if sequence_file is not None: # Alignment of sequences on pangenome families + with read_compressed_or_not(sequence_file) as seqFileObj: + seq_set, is_nucleotide = get_seq_ids(seqFileObj) + with create_tmpdir(main_dir=tmpdir, basename="align_input_seq_tmpdir", keep_tmp=keep_tmp) as new_tmpdir: if use_representatives: - seq_set, _, seq2pan = get_input_seq_to_family_with_rep(pangenome, sequence_file, output, new_tmpdir, - cpu, no_defrag, identity=identity, coverage=coverage, - translation_table=translation_table) + _, seq2pan = get_input_seq_to_family_with_rep(pangenome, sequence_file, output, new_tmpdir, is_input_seq_nt=is_nucleotide, + cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, + translation_table=translation_table, disable_bar=disable_bar) else: - seq_set, _, seq2pan = get_input_seq_to_family_with_all(pangenome=pangenome, sequence_file=sequence_file, - output=output, tmpdir=new_tmpdir, + _, seq2pan = get_input_seq_to_family_with_all(pangenome=pangenome, sequence_file=sequence_file, + output=output, tmpdir=new_tmpdir, is_input_seq_nt=is_nucleotide, cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, translation_table=translation_table, disable_bar=disable_bar) diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index 2dbd372f..6a5dd6bc 100644 --- 
a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -58,7 +58,7 @@ def launch(args: argparse.Namespace): output_dir = Path(args.output) mk_outdir(output_dir, args.force) - # For the moment this element of the pangenome are predicted by default + # For the moment these elements of the pangenome are predicted by default project_modules = True predict_rgp = True project_spots = True @@ -150,7 +150,7 @@ def launch(args: argparse.Namespace): pangenome.add_organism(input_organism) singleton_gene_count = annotate_input_genes_with_pangenome_families(pangenome, input_organism=input_organism, - output=output_dir, cpu=args.cpu,use_representatives=args.fast, + output=output_dir, cpu=args.cpu, use_representatives=args.fast, no_defrag=args.no_defrag, identity=args.identity, coverage=args.coverage, tmpdir=args.tmpdir, translation_table=args.translation_table, keep_tmp=args.keep_tmp, @@ -284,18 +284,18 @@ def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_org with open(seq_fasta_file, "w") as fh_out_faa: write_gene_sequences_from_annotations( input_organism.genes, fh_out_faa, disable_bar=True, add="ppanggolin_") - + seq_set = {gene.ID if gene.local_identifier == "" else gene.local_identifier for gene in input_organism.genes} with create_tmpdir(main_dir=tmpdir, basename="align_input_seq_tmpdir", keep_tmp=keep_tmp) as new_tmpdir: if use_representatives: - seq_set, _, seqid_to_gene_family = get_input_seq_to_family_with_rep(pangenome, seq_fasta_file, output, new_tmpdir, - cpu, no_defrag, identity=identity, coverage=coverage, translation_table=translation_table) + _, seqid_to_gene_family = get_input_seq_to_family_with_rep(pangenome, seq_fasta_file, output, new_tmpdir, is_input_seq_nt=True, + cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, translation_table=translation_table) else: - seq_set, _, seqid_to_gene_family = get_input_seq_to_family_with_all(pangenome=pangenome, sequence_file=seq_fasta_file, - 
output=output, tmpdir=new_tmpdir, + _, seqid_to_gene_family = get_input_seq_to_family_with_all(pangenome=pangenome, sequence_file=seq_fasta_file, + output=output, tmpdir=new_tmpdir, is_input_seq_nt=True, cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, translation_table=translation_table, disable_bar=disable_bar) From 1d678c91fe1bfd2dabae9e7bd7c7992633e05559 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Thu, 14 Sep 2023 12:01:39 +0200 Subject: [PATCH 103/173] give possibility to process mulitple input genomes --- ppanggolin/projection/projection.py | 298 +++++++++++++++++++++++----- 1 file changed, 246 insertions(+), 52 deletions(-) diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index 6a5dd6bc..fd1212e1 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -21,7 +21,7 @@ # # local libraries from ppanggolin.annotate.synta import annotate_organism, read_fasta, get_dna_sequence -from ppanggolin.annotate.annotate import read_anno_file +from ppanggolin.annotate.annotate import read_anno_file, launch_read_anno, launch_annotate_organism from ppanggolin.annotate import subparser as annotate_subparser from ppanggolin.pangenome import Pangenome # from ppanggolin.genome import input_organism, Gene, RNA, Contig @@ -106,46 +106,40 @@ def launch(args: argparse.Namespace): logging.getLogger().info('Retrieving parameters from the provided pangenome file.') pangenome_params = argparse.Namespace( **{step: argparse.Namespace(**k_v) for step, k_v in pangenome.parameters.items()}) - - if args.organism_name in [org.name for org in pangenome.organisms]: - raise NameError( - f"The provided organism name '{args.organism_name}' already exists in the given pangenome.") - - if args.annot_file is not None: - # read_annotations(pangenome, args.anno, cpu=args.cpu, pseudo=args.use_pseudo, disable_bar=args.disable_prog_bar) - input_organism, has_sequence = 
read_anno_file(organism_name=args.organism_name, - filename=args.annot_file, - circular_contigs=[], - pseudo=args.use_pseudo) - if input_organism.number_of_genes() == 0: - raise ValueError("The input organism lacks gene annotations. " - f"Please verify the provided annotation file: {args.annot_file}") - - if not has_sequence: - if args.fasta_file: - retrieve_gene_sequences_from_fasta_file( - input_organism, args.fasta_file) + + + if args.anno is not None: + genome_name_to_annot_path = parse_input_paths_file(args.anno) + check_input_names(pangenome, genome_name_to_annot_path, args.anno) + + organisms, org_2_has_fasta = read_annotation_files(genome_name_to_annot_path, cpu=args.cpu, pseudo=args.use_pseudo, + disable_bar=args.disable_prog_bar) + if not all((has_fasta for has_fasta in org_2_has_fasta.values())): + organisms_with_no_fasta = {org for org, has_fasta in org_2_has_fasta.items() if not has_fasta} + if args.fasta is not None: + get_gene_sequences_from_fasta_files(organisms_with_no_fasta, args.fasta) else: - raise Exception("The gff/gbff provided did not have any sequence information, " - "Thus, we do not have the information we need to continue the projection.") + raise ValueError(f"You provided GFF files for {len(organisms_with_no_fasta)} (out of {len(organisms)}) " + "organisms without associated sequence data, and you did not provide " + "FASTA sequences using the --fasta option. Therefore, it is impossible to project the pangenome onto the input genomes. 
" + f"The following organisms have no associated sequence data: {', '.join(o.name for o in organisms_with_no_fasta)}") - elif args.fasta_file is not None: + + elif args.fasta is not None: annotate_param_names = ["norna", "kingdom", "allow_overlap", "prodigal_procedure"] - annotate_params = manage_annotate_param( - annotate_param_names, pangenome_params.annotate, args.config) + annotate_params = manage_annotate_param(annotate_param_names, pangenome_params.annotate, args.config) - input_organism = annotate_organism(org_name=args.organism_name, file_name=args.fasta_file, circular_contigs=[], tmpdir=args.tmpdir, - code=args.translation_table, norna=annotate_params.norna, kingdom=annotate_params.kingdom, - allow_overlap=annotate_params.allow_overlap, procedure=annotate_params.prodigal_procedure) - if input_organism.number_of_genes() == 0: - raise ValueError("No genes have been predicted in the input organism's FASTA file, making projection impossible.") - - else: - raise Exception( - "At least one of --fasta_file or --anno_file must be given") + genome_name_to_fasta_path = parse_input_paths_file(args.fasta) + check_input_names(pangenome, genome_name_to_fasta_path, args.fasta) + organisms = annotate_fasta_files(genome_name_to_fasta_path=genome_name_to_fasta_path, tmpdir=args.tmpdir, cpu=args.cpu, + translation_table=args.translation_table, norna=annotate_params.norna, kingdom=annotate_params.kingdom, + allow_overlap=annotate_params.allow_overlap, procedure=annotate_params.prodigal_procedure, disable_bar=args.disable_prog_bar ) + + + exit() # Add input organism in pangenome. This is temporary as the pangenome object is not going to be written. 
pangenome.add_organism(input_organism) @@ -198,6 +192,197 @@ def launch(args: argparse.Namespace): summarize_projection(input_organism, pangenome, input_org_rgps, input_org_spots, input_org_modules, singleton_gene_count ) +def annotate_fasta_files(genome_name_to_fasta_path: Dict[str,dict], tmpdir: str, cpu: int = 1, translation_table: int = 11, + kingdom: str = "bacteria", norna: bool = False, allow_overlap: bool = False, procedure: str = None, + disable_bar: bool = False): + """ + Main function to annotate a pangenome + + :param genome_name_to_annot_path: + :param fasta_list: List of fasta file containing sequences that will be base of pangenome + :param tmpdir: Path to temporary directory + :param cpu: number of CPU cores to use + :param translation_table: Translation table (genetic code) to use. + :param kingdom: Kingdom to which the prokaryota belongs to, to know which models to use for rRNA annotation. + :param norna: Use to avoid annotating RNA features. + :param allow_overlap: Use to not remove genes overlapping with RNA features + :param procedure: prodigal procedure used + :param disable_bar: Disable the progresse bar + """ + + organisms = [] + arguments = [] # Argument given to annotate organism in same order than prototype + for org_name, org_info in genome_name_to_fasta_path.items(): + + arguments.append((org_name, org_info['path'], org_info['circular_contigs'], tmpdir, translation_table, + norna, kingdom, allow_overlap, procedure)) + + logging.getLogger("PPanGGOLiN").info(f"Annotating {len(arguments)} genomes using {cpu} cpus...") + with get_context('fork').Pool(processes=cpu) as p: + for organism in tqdm(p.imap_unordered(launch_annotate_organism, arguments), unit="genome", + total=len(arguments), disable=disable_bar): + + organisms.append(organism) + p.close() + p.join() + + return organisms + + + +def read_annotation_files(genome_name_to_annot_path: Dict[str,dict], cpu: int = 1, pseudo: bool = False, + disable_bar: bool = False): + """ + Read the 
annotation from GBFF file + + :param pangenome: pangenome object + :param organisms_file: List of GBFF files for each organism + :param cpu: number of CPU cores to use + :param pseudo: allow to read pseudogène + :param disable_bar: Disable the progresse bar + """ + + args = [] + organisms = [] + + # we assume there are gene sequences in the annotation files, + # unless a gff file without fasta is met (which is the only case where sequences can be absent) + org_to_has_fasta_flag = {} + + + for org_name, org_info in genome_name_to_annot_path.items(): + + args.append((org_name, org_info['path'], org_info['circular_contigs'], pseudo)) + + with get_context('fork').Pool(cpu) as p: + for org, has_fasta in tqdm(p.imap_unordered(launch_read_anno, args), unit="file", total=len(args), + disable=disable_bar): + + organisms.append(org) + org_to_has_fasta_flag[org] = has_fasta + + return organisms, org_to_has_fasta_flag + + +# def annotate_input_genomes(): + + + +# if args.annot_file is not None: +# # read_annotations(pangenome, args.anno, cpu=args.cpu, pseudo=args.use_pseudo, disable_bar=args.disable_prog_bar) +# input_organism, has_sequence = read_anno_file(organism_name=args.organism_name, +# filename=args.annot_file, +# circular_contigs=[], +# pseudo=args.use_pseudo) +# if input_organism.number_of_genes() == 0: +# raise ValueError("The input organism lacks gene annotations. 
" +# f"Please verify the provided annotation file: {args.annot_file}") + +# if not has_sequence: +# if args.fasta_file: +# retrieve_gene_sequences_from_fasta_file( +# input_organism, args.fasta_file) +# else: +# raise Exception("The gff/gbff provided did not have any sequence information, " +# "Thus, we do not have the information we need to continue the projection.") + +# elif args.fasta_file is not None: +# annotate_param_names = ["norna", "kingdom", +# "allow_overlap", "prodigal_procedure"] + +# annotate_params = manage_annotate_param( +# annotate_param_names, pangenome_params.annotate, args.config) + +# input_organism = annotate_organism(org_name=args.organism_name, file_name=args.fasta_file, circular_contigs=[], tmpdir=args.tmpdir, +# code=args.translation_table, norna=annotate_params.norna, kingdom=annotate_params.kingdom, +# allow_overlap=annotate_params.allow_overlap, procedure=annotate_params.prodigal_procedure) +# if input_organism.number_of_genes() == 0: +# raise ValueError("No genes have been predicted in the input organism's FASTA file, making projection impossible.") + +# else: +# raise Exception( +# "At least one of --fasta_file or --anno_file must be given") + + + + +def get_gene_sequences_from_fasta_files(organisms, fasta_paths_file): + """ + Get gene sequences from fasta path file + + :param organisms: input pangenome + :param fasta_file: list of fasta file + """ + genome_name_to_annot_path = parse_input_paths_file(fasta_paths_file) + + org_names = {org.name for org in organisms} + + if org_names & set(genome_name_to_annot_path) != org_names: + missing = len(org_names - set(genome_name_to_annot_path)) + raise ValueError(f"Not all of your pangenome organisms are present within the provided fasta file: {fasta_paths_file}. 
" + f"{missing} are missing (out of {len(organisms)}).") + + for org in organisms: + + org_fasta_file = genome_name_to_annot_path[org.name]['path'] + + with read_compressed_or_not(org_fasta_file) as currFastaFile: + org_contig_to_seq, _ = read_fasta(org, currFastaFile) + + for contig in org.contigs: + try: + contig_seq = org_contig_to_seq[contig.name] + except KeyError: + msg = f"Fasta file for organism {org.name} did not have the contig {contig.name} " \ + f"that was read from the annotation file. " + msg += f"The provided contigs in the fasta were : " \ + f"{', '.join([contig for contig in org_contig_to_seq])}." + raise KeyError(msg) + + for gene in contig.genes: + gene.add_sequence(get_dna_sequence(contig_seq, gene)) + + for rna in contig.RNAs: + rna.add_sequence(get_dna_sequence(contig_seq, rna)) + + + +def check_input_names(pangenome, input_names, path_list_file): + duplicated_names = set(input_names) & {org.name for org in pangenome.organisms} + if len(duplicated_names) != 0: + raise NameError(f"{len(duplicated_names)} organism names found in '{path_list_file}' already exists in the given pangenome: {' '.join(duplicated_names)}") + + + +def parse_input_paths_file(path_list_file): + + + logging.getLogger("PPanGGOLiN").info(f"Reading {path_list_file} to process organism files") + genome_name_to_genome_path = {} + + + for line in read_compressed_or_not(path_list_file): + + elements = [el.strip() for el in line.split("\t")] + genome_file_path = Path(elements[1]) + genome_name = elements[0] + putative_circular_contigs = elements[2:] + + if not genome_file_path.exists(): # Check tsv sanity test if it's not one it's the other + genome_file_path_alt = path_list_file.parent.joinpath(genome_file_path) + + if not genome_file_path_alt.exists(): + raise FileNotFoundError(f"The file path '{genome_file_path}' for genome '{genome_name}' specified in '{path_list_file}' does not exist.") + else: + genome_file_path = genome_file_path_alt + + 
genome_name_to_genome_path[genome_name] = {"path":genome_file_path, "circular_contigs":putative_circular_contigs} + + if len(genome_name_to_genome_path) == 0: + raise Exception(f"There are no genomes in the provided file: {path_list_file} ") + + return genome_name_to_genome_path + def summarize_projection(input_organism:Organism, pangenome:Pangenome, input_org_rgps:Region, @@ -254,15 +439,15 @@ def summarize_projection(input_organism:Organism, pangenome:Pangenome, input_org print(yaml_string) -def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_organism: Organism, output: Path, +def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_organisms: Iter[Organism], output: Path, cpu: int,use_representatives:bool, no_defrag: bool, identity: float, coverage: float, tmpdir: Path, translation_table: int, keep_tmp:bool = False, disable_bar: bool =False): """ - Annotate input genes with pangenome gene families and perform clustering. + Annotate input genes with pangenome gene families by associating them to a cluster. :param pangenome: Pangenome object. - :param input_organism: Input organism object. + :param input_organisms: Iterable of input organism objects. :param output: Output directory for generated files. :param cpu: Number of CPU cores to use. :param no_defrag: Whether to use defragmentation. @@ -277,19 +462,19 @@ def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_org :return: Number of genes that do not cluster with any of the gene families of the pangenome. """ - seq_fasta_file = output / f"{input_organism.name}.fasta" + for input_organism in input_organisms: + seq_fasta_file = output / f"{input_organism.name}.fasta" - logging.info(f'The input organism has {input_organism.number_of_genes()} genes. Writting them in {seq_fasta_file}') + logging.info(f'The input organism has {input_organism.number_of_genes()} genes. 
Writting them in {seq_fasta_file}') - with open(seq_fasta_file, "w") as fh_out_faa: - write_gene_sequences_from_annotations( - input_organism.genes, fh_out_faa, disable_bar=True, add="ppanggolin_") - seq_set = {gene.ID if gene.local_identifier == "" else gene.local_identifier for gene in input_organism.genes} + with open(seq_fasta_file, "w") as fh_out_faa: + write_gene_sequences_from_annotations( + input_organism.genes, fh_out_faa, disable_bar=True, add="ppanggolin_") + seq_set = {gene.ID if gene.local_identifier == "" else gene.local_identifier for gene in input_organism.genes} with create_tmpdir(main_dir=tmpdir, basename="align_input_seq_tmpdir", keep_tmp=keep_tmp) as new_tmpdir: - if use_representatives: _, seqid_to_gene_family = get_input_seq_to_family_with_rep(pangenome, seq_fasta_file, output, new_tmpdir, is_input_seq_nt=True, cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, translation_table=translation_table) @@ -762,17 +947,26 @@ def parser_projection(parser: argparse.ArgumentParser): description="One of the following arguments is required :") required.add_argument('-p', '--pangenome', required=False, type=Path, help="The pangenome.h5 file") + + required.add_argument('--fasta', required=False, type=Path, + help="A tab-separated file listing the organism names, and the fasta filepath of its genomic " + "sequence(s) (the fastas can be compressed with gzip). One line per organism.") + required.add_argument('--anno', required=False, type=Path, + help="A tab-separated file listing the organism names, and the gff/gbff filepath of its " + "annotations (the files can be compressed with gzip). One line per organism. 
" + "If this is provided, those annotations will be used.") + - required.add_argument("-n", '--organism_name', required=False, type=str, - help="Name of the input_organism whose genome is being annotated with the provided pangenome.") + # required.add_argument("-n", '--organism_name', required=False, type=str, + # help="Name of the input_organism whose genome is being annotated with the provided pangenome.") - required.add_argument('--fasta_file', required=False, type=Path, - help="The filepath of the genomic sequence(s) in FASTA format if the genome to annotate. " - "(Fasta file can be compressed with gzip)") + # required.add_argument('--fasta_file', required=False, type=Path, + # help="The filepath of the genomic sequence(s) in FASTA format if the genome to annotate. " + # "(Fasta file can be compressed with gzip)") - required.add_argument('--annot_file', required=False, type=Path, - help="The filepath of the annotations in GFF/GBFF format for the genome to annotate with the provided pangenome. " - "(Annotation file can be compressed with gzip)") + # required.add_argument('--annot_file', required=False, type=Path, + # help="The filepath of the annotations in GFF/GBFF format for the genome to annotate with the provided pangenome. 
" + # "(Annotation file can be compressed with gzip)") optional = parser.add_argument_group(title="Optional arguments") From 7e460dbc1441418fd1010bee59c0fd3d5a097992 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Thu, 14 Sep 2023 18:26:09 +0200 Subject: [PATCH 104/173] fix issue in adding gene in contig introduced in PR #132 --- ppanggolin/annotate/annotate.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ppanggolin/annotate/annotate.py b/ppanggolin/annotate/annotate.py index 6b45efc3..fb617dd8 100644 --- a/ppanggolin/annotate/annotate.py +++ b/ppanggolin/annotate/annotate.py @@ -362,6 +362,8 @@ def get_id_attribute(attributes_dict: dict) -> str: genetic_code=genetic_code) gene.fill_parents(org, contig) gene_counter += 1 + contig.add(gene) + elif "RNA" in fields_gff[gff_type]: rna = RNA(org.name + "_CDS_" + str(rna_counter).zfill(4)) rna.fill_annotations(start=int(fields_gff[gff_start]), stop=int(fields_gff[gff_end]), @@ -369,6 +371,8 @@ def get_id_attribute(attributes_dict: dict) -> str: product=product, local_identifier=gene_id) rna.fill_parents(org, contig) rna_counter += 1 + contig.add_rna(rna) + # GET THE FASTA SEQUENCES OF THE GENES if has_fasta and fasta_string != "": From 71b4c5caa9e71321fdee84433025ad654dddbc5c Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Fri, 15 Sep 2023 09:39:29 +0200 Subject: [PATCH 105/173] make seq aln possible with multiple query seq files --- ppanggolin/align/alignOnPang.py | 41 +++--- ppanggolin/annotate/annotate.py | 31 +++-- ppanggolin/context/searchGeneContext.py | 4 +- ppanggolin/formats/writeSequences.py | 2 +- ppanggolin/main.py | 14 +- ppanggolin/projection/projection.py | 166 ++++++++++++++---------- 6 files changed, 153 insertions(+), 105 deletions(-) diff --git a/ppanggolin/align/alignOnPang.py b/ppanggolin/align/alignOnPang.py index eb72c40e..9e527450 100644 --- a/ppanggolin/align/alignOnPang.py +++ b/ppanggolin/align/alignOnPang.py @@ -9,7 +9,7 @@ import subprocess import argparse from collections import 
defaultdict, Counter -from typing import List, Tuple, Set, Dict, IO, Iterator +from typing import List, Tuple, Set, Dict, IO, Iterator, Iterable from pathlib import Path @@ -25,11 +25,11 @@ from ppanggolin.formats.readBinaries import get_non_redundant_gene_sequences_from_file -def create_mmseqs_db(seq_file: Path, tmpdir: Path, basename="sequences") -> Path: +def create_mmseqs_db(seq_files: Iterable[Path], tmpdir: Path, basename="sequences") -> Path: """ - Create a MMseqs2 sequence database with the given fasta file. + Create a MMseqs2 sequence database with the given fasta files. - :param seq_file: Path to the input FASTA file. + :param seq_file: An iterable of path of FASTA files. :param tmpdir: Path to the temporary directory where the database will be created. :param basename: Prefix for the database file (default: "sequences"). @@ -37,7 +37,7 @@ def create_mmseqs_db(seq_file: Path, tmpdir: Path, basename="sequences") -> Path """ with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir, delete=False, suffix=".DB", prefix=basename) as seqdb: - cmd = ["mmseqs", "createdb", seq_file.as_posix(), seqdb.name, '--dbtype', '0'] + cmd = ["mmseqs", "createdb"] + [seq_file.as_posix() for seq_file in seq_files] + [seqdb.name, '--dbtype', '0'] logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) subprocess.run(cmd, stdout=subprocess.DEVNULL) @@ -67,7 +67,7 @@ def translate_with_mmseqs(seqdb: Path, translation_table: int, cpu: int, tmpdir: return Path(seqdb_aa.name) -def align_seq_to_pang(target_seq_file:Path , query_seq_file: Path, output: Path, +def align_seq_to_pang(target_seq_file:Path , query_seq_files: Iterable[Path], tmpdir: Path, cpu: int = 1, no_defrag: bool = False, identity: float = 0.8, coverage: float = 0.8, is_query_nt:bool = False, is_target_nt:bool = False, translation_table: int = None) -> Path: @@ -75,8 +75,7 @@ def align_seq_to_pang(target_seq_file:Path , query_seq_file: Path, output: Path, Align fasta sequence to pangenome sequences. 
:param target_seq_file: File with sequences of pangenome (target) - :param query_seq_file: File with sequences from input file (query) - :param output: Path of the output directory + :param query_seq_files: Iterable of files with sequences from input file (query) :param tmpdir: Temporary directory to align sequences :param cpu: Number of available cpu :param no_defrag: Do not apply defragmentation @@ -89,8 +88,8 @@ def align_seq_to_pang(target_seq_file:Path , query_seq_file: Path, output: Path, :return: Alignement result file """ - target_db = create_mmseqs_db(target_seq_file, tmpdir, basename="target_sequences") - query_db = create_mmseqs_db(query_seq_file, tmpdir, basename="query_sequences") + target_db = create_mmseqs_db([target_seq_file], tmpdir, basename="target_sequences") + query_db = create_mmseqs_db(query_seq_files, tmpdir, basename="query_sequences") if is_target_nt: logging.getLogger().debug(f"Target sequences will be translated by mmseqs with translation table {translation_table}") @@ -147,8 +146,8 @@ def map_input_gene_to_family_all_aln(aln_res: Path, outdir:Path, pangenome: Pang aln_file_clean = outdir / f"alignment_input_seqs_to_all_pangenome_genes.tsv" # write the actual result file input_seq_to_gene_family = outdir / f"input_seqs_to_gene_family.tsv" logging.getLogger().debug(f'Writing alignment file in {aln_file_clean}') - logging.getLogger().debug(f'Writing Gene family id to input seq id file in {input_seq_to_gene_family}') - + logging.getLogger().debug(f'Writing gene family IDs to the input sequence ID file: {input_seq_to_gene_family}') + with open(aln_res, "r") as alnFile, open(aln_file_clean, "w") as aln_outfl, open(input_seq_to_gene_family, "w") as outgene2fam: for line in alnFile: line_splitted = line.split() @@ -259,7 +258,7 @@ def write_all_gene_sequences(pangenome: Pangenome, file_obj: IO, add: str = "", """ if pangenome.status["geneSequences"] == "inFile": - get_non_redundant_gene_sequences_from_file(pangenome.file, file_obj, 
disable_bar=disable_bar) + get_non_redundant_gene_sequences_from_file(pangenome.file, file_obj, add=add, disable_bar=disable_bar) else: # this should never happen if the pangenome has been properly checked before launching this function. raise Exception("The pangenome does not include gene sequences") @@ -406,7 +405,7 @@ def get_seq_info(seq_to_pang: dict, pangenome: Pangenome, output: Path, draw_rel logging.getLogger("PPanGGOLiN").info(f"File listing RGP and spots where sequences of interest are located : " f"{output / 'info_input_seq.tsv'}") -def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_file: Path, output: Path, tmpdir: Path, is_input_seq_nt:bool, +def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_files: Iterable[Path], output: Path, tmpdir: Path, is_input_seq_nt:bool, cpu: int = 1, no_defrag: bool = False, identity: float = 0.8, coverage: float = 0.8, translation_table: int = 11, disable_bar:bool = False) -> Tuple[set, str, dict]: @@ -417,7 +416,7 @@ def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_file: Path, to appropriate gene families based on alignment results. :param pangenome: Annotated pangenome containing gene families. - :param sequence_file: Path to a FASTA file containing input sequences to align. + :param sequence_files: Iterable of paths of FASTA files containing input sequences to align. :param output: Path to the output directory where alignment results will be stored. :param tmpdir: Temporary directory for intermediate files. :param is_input_seq_nt: Is input sequence file nucleotide sequences. 
@@ -439,7 +438,7 @@ def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_file: Path, logging.getLogger().debug(f'Write gene family sequences in {tmp_pang_file.name}') write_gene_fam_sequences(pangenome, tmp_pang_file, add="ppanggolin_", disable_bar=disable_bar) - align_file = align_seq_to_pang(target_seq_file=Path(tmp_pang_file.name), query_seq_file=sequence_file, output=output, + align_file = align_seq_to_pang(target_seq_file=Path(tmp_pang_file.name), query_seq_files=sequence_files, tmpdir=tmpdir, cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, is_query_nt=is_input_seq_nt, is_target_nt=False, @@ -449,7 +448,7 @@ def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_file: Path, return align_file, seq2pang -def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_file: Path, output: Path, tmpdir: Path, is_input_seq_nt:bool, +def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_files: Iterable[Path], output: Path, tmpdir: Path, is_input_seq_nt:bool, cpu: int = 1, no_defrag: bool = False, identity: float = 0.8, coverage: float = 0.8, translation_table: int = 11, disable_bar:bool = False) -> Tuple[set, str, dict]: """ @@ -459,7 +458,7 @@ def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_file: Path, to a gene families based on alignment results. :param pangenome: Annotated pangenome containing genes. - :param sequence_file: Path to a FASTA file containing input sequences to align. + :param sequence_files: Iterable of paths of FASTA files containing input sequences to align. :param output: Path to the output directory where alignment results will be stored. :param tmpdir: Temporary directory for intermediate files. :param is_input_seq_nt: Is input sequence file nucleotide sequences. 
@@ -481,7 +480,7 @@ def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_file: Path, logging.getLogger().debug(f'Write all pangenome gene sequences in {tmp_pang_file.name}') write_all_gene_sequences(pangenome, tmp_pang_file, add="ppanggolin_", disable_bar=disable_bar) - align_file = align_seq_to_pang(target_seq_file=Path(tmp_pang_file.name), query_seq_file=sequence_file, output=output, + align_file = align_seq_to_pang(target_seq_file=Path(tmp_pang_file.name), query_seq_files=sequence_files, tmpdir=tmpdir, cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, is_query_nt=is_input_seq_nt, is_target_nt=True, @@ -540,11 +539,11 @@ def align(pangenome: Pangenome, sequence_file: Path, output: Path, identity: flo if use_representatives: - align_file, seq2pang = get_input_seq_to_family_with_rep(pangenome, sequence_file, output, new_tmpdir, is_input_seq_nt=is_nucleotide, + align_file, seq2pang = get_input_seq_to_family_with_rep(pangenome, [sequence_file], output, new_tmpdir, is_input_seq_nt=is_nucleotide, cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, translation_table=translation_table, disable_bar=disable_bar) else: - align_file, seq2pang = get_input_seq_to_family_with_all(pangenome=pangenome, sequence_file=sequence_file, + align_file, seq2pang = get_input_seq_to_family_with_all(pangenome=pangenome, sequence_file=[sequence_file], output=output, tmpdir=new_tmpdir, is_input_seq_nt=is_nucleotide, cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, translation_table=translation_table, disable_bar=disable_bar) diff --git a/ppanggolin/annotate/annotate.py b/ppanggolin/annotate/annotate.py index fb617dd8..455eae46 100644 --- a/ppanggolin/annotate/annotate.py +++ b/ppanggolin/annotate/annotate.py @@ -9,7 +9,7 @@ from pathlib import Path import tempfile import time -from typing import Tuple +from typing import Tuple, Iterable # installed libraries from tqdm import tqdm @@ -420,9 +420,9 @@ def 
read_anno_file(organism_name: str, filename: Path, circular_contigs: list, p else: # Fasta type obligatory because unknow raise an error in detect_filetype function raise Exception("Wrong file type provided. This looks like a fasta file. " "You may be able to use --fasta instead.") + - -def chose_gene_identifiers(pangenome) -> bool: +def chose_gene_identifiers(pangenome:Pangenome)-> bool: """ Parses the pangenome genes to decide whether to use local_identifiers or ppanggolin generated gene identifiers. If the local identifiers are unique within the pangenome they are picked, otherwise ppanggolin ones are used. @@ -431,19 +431,34 @@ def chose_gene_identifiers(pangenome) -> bool: :return: Boolean stating True if local identifiers are used, and False otherwise """ + + if local_identifiers_are_unique(pangenome.genes): + + for gene in pangenome.genes: + gene.ID = gene.local_identifier # Erase ppanggolin generated gene ids and replace with local identifiers + gene.local_identifier = "" # this is now useless, setting it to default value + pangenome._mk_gene_getter() # re-build the gene getter + + else: + return False + +def local_identifiers_are_unique(genes: Iterable[Gene]) -> bool: + """ + Check if local_identifiers of genes are uniq in order to decide if they should be used as gene id. 
+ + :param genes: Iterable of gene objects + + :return: Boolean stating True if local identifiers are uniq, and False otherwise + """ gene_id_2_local = {} local_to_gene_id = {} - for gene in pangenome.genes: + for gene in genes: gene_id_2_local[gene.ID] = gene.local_identifier local_to_gene_id[gene.local_identifier] = gene.ID if len(local_to_gene_id) != len(gene_id_2_local): # then, there are non unique local identifiers return False # if we reach this line, local identifiers are unique within the pangenome - for gene in pangenome.genes: - gene.ID = gene.local_identifier # Erase ppanggolin generated gene ids and replace with local identifiers - gene.local_identifier = "" # this is now useless, setting it to default value - pangenome._mk_gene_getter() # re-build the gene getter return True diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index d75cee64..e1403a5f 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -64,11 +64,11 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: Path, tmpdir: with create_tmpdir(main_dir=tmpdir, basename="align_input_seq_tmpdir", keep_tmp=keep_tmp) as new_tmpdir: if use_representatives: - _, seq2pan = get_input_seq_to_family_with_rep(pangenome, sequence_file, output, new_tmpdir, is_input_seq_nt=is_nucleotide, + _, seq2pan = get_input_seq_to_family_with_rep(pangenome, [sequence_file], output, new_tmpdir, is_input_seq_nt=is_nucleotide, cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, translation_table=translation_table, disable_bar=disable_bar) else: - _, seq2pan = get_input_seq_to_family_with_all(pangenome=pangenome, sequence_file=sequence_file, + _, seq2pan = get_input_seq_to_family_with_all(pangenome=pangenome, sequence_file=[sequence_file], output=output, tmpdir=new_tmpdir, is_input_seq_nt=is_nucleotide, cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, diff --git 
a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py index 1088782b..ef9b0370 100644 --- a/ppanggolin/formats/writeSequences.py +++ b/ppanggolin/formats/writeSequences.py @@ -37,7 +37,7 @@ def write_gene_sequences_from_annotations(genes_to_write: Iterable[Gene], file_o :param add: Add prefix to gene ID. :param disable_bar: Disable progress bar. """ - logging.getLogger("PPanGGOLiN").info("Writing all of the CDS sequences...") + logging.getLogger("PPanGGOLiN").info(f"Writing all CDS sequences in {file_obj.name}") for gene in tqdm(genes_to_write, unit="gene", disable=disable_bar): if gene.type == "CDS": gene_id = gene.ID if gene.local_identifier == "" else gene.local_identifier diff --git a/ppanggolin/main.py b/ppanggolin/main.py index eb2a310c..8024355e 100644 --- a/ppanggolin/main.py +++ b/ppanggolin/main.py @@ -145,14 +145,14 @@ def cmd_line() -> argparse.Namespace: "using the --sequences argument, either through the command line or the config file.") - if args.subcommand == "projection" and args.organism_name is None: - parser.error("Please specify the name of the input organism you want to annotate using the provided pangenome. " - "You can use the --organism_name argument either through the command line or the config file.") + # if args.subcommand == "projection" and args.organism_name is None: + # parser.error("Please specify the name of the input organism you want to annotate using the provided pangenome. 
" + # "You can use the --organism_name argument either through the command line or the config file.") - if args.subcommand == "projection" and args.fasta_file is None and args.annot_file is None: - parser.error("Please provide either a sequence file using the --fasta_file option or an annotation file (GFF/GBFF) " - "using the --annot_file option for the input organism, either through the command line or the config file, " - "to enable annotation with the provided pangenome.") + # if args.subcommand == "projection" and args.fasta_file is None and args.annot_file is None: + # parser.error("Please provide either a sequence file using the --fasta_file option or an annotation file (GFF/GBFF) " + # "using the --annot_file option for the input organism, either through the command line or the config file, " + # "to enable annotation with the provided pangenome.") diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index fd1212e1..3bdc44b3 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -9,7 +9,7 @@ import time from pathlib import Path import tempfile -from typing import Tuple, Set, Dict, Iterator, Optional, List +from typing import Tuple, Set, Dict, Iterator, Optional, List, Iterable from collections import defaultdict import csv @@ -21,11 +21,11 @@ # # local libraries from ppanggolin.annotate.synta import annotate_organism, read_fasta, get_dna_sequence -from ppanggolin.annotate.annotate import read_anno_file, launch_read_anno, launch_annotate_organism +from ppanggolin.annotate.annotate import read_anno_file, launch_read_anno, launch_annotate_organism, local_identifiers_are_unique from ppanggolin.annotate import subparser as annotate_subparser from ppanggolin.pangenome import Pangenome # from ppanggolin.genome import input_organism, Gene, RNA, Contig -from ppanggolin.utils import create_tmpdir, read_compressed_or_not, write_compressed_or_not, restricted_float, mk_outdir, get_config_args, 
parse_config_file, get_default_args +from ppanggolin.utils import create_tmpdir, read_compressed_or_not, write_compressed_or_not, restricted_float, mk_outdir, get_config_args, parse_config_file, get_default_args, check_input_files from ppanggolin.align.alignOnPang import get_input_seq_to_family_with_rep,get_input_seq_to_family_with_all, project_and_write_partition from ppanggolin.formats.writeSequences import write_gene_sequences_from_annotations from ppanggolin.formats.readBinaries import check_pangenome_info @@ -63,6 +63,13 @@ def launch(args: argparse.Namespace): predict_rgp = True project_spots = True + if hasattr(args, "fasta") and args.fasta is not None: + check_input_files(args.fasta, True) + + if hasattr(args, "anno") and args.anno is not None: + check_input_files(args.anno, True) + + pangenome = Pangenome() pangenome.add_file(args.pangenome) @@ -72,18 +79,18 @@ def launch(args: argparse.Namespace): "See the 'partition' subcommands.") if pangenome.status["predictedRGP"] not in ["Computed", "Loaded", "inFile"]: - logging.getLogger().info("RGPs have not been predicted in the provided pangenome. " + logging.getLogger('PPanGGOLiN').info("RGPs have not been predicted in the provided pangenome. " "Projection of RGPs and spots into the provided genome will not be performed.") predict_rgp = False project_spots = False elif pangenome.status["spots"] not in ["Computed", "Loaded", "inFile"]: - logging.getLogger().info("Spots have not been predicted in the provided pangenome. " + logging.getLogger('PPanGGOLiN').info("Spots have not been predicted in the provided pangenome. " "Projection of spots into the provided genome will not be performed.") project_spots = False if pangenome.status["modules"] not in ["Computed", "Loaded", "inFile"]: - logging.getLogger().info("Modules have not been predicted in the provided pangenome. " + logging.getLogger('PPanGGOLiN').info("Modules have not been predicted in the provided pangenome. 
" "Projection of modules into the provided genome will not be performed.") project_modules = False @@ -103,7 +110,7 @@ def launch(args: argparse.Namespace): need_spots=project_spots) - logging.getLogger().info('Retrieving parameters from the provided pangenome file.') + logging.getLogger('PPanGGOLiN').info('Retrieving parameters from the provided pangenome file.') pangenome_params = argparse.Namespace( **{step: argparse.Namespace(**k_v) for step, k_v in pangenome.parameters.items()}) @@ -114,6 +121,7 @@ def launch(args: argparse.Namespace): organisms, org_2_has_fasta = read_annotation_files(genome_name_to_annot_path, cpu=args.cpu, pseudo=args.use_pseudo, disable_bar=args.disable_prog_bar) + if not all((has_fasta for has_fasta in org_2_has_fasta.values())): organisms_with_no_fasta = {org for org, has_fasta in org_2_has_fasta.items() if not has_fasta} if args.fasta is not None: @@ -124,7 +132,9 @@ def launch(args: argparse.Namespace): "FASTA sequences using the --fasta option. Therefore, it is impossible to project the pangenome onto the input genomes. " f"The following organisms have no associated sequence data: {', '.join(o.name for o in organisms_with_no_fasta)}") - + + + elif args.fasta is not None: annotate_param_names = ["norna", "kingdom", "allow_overlap", "prodigal_procedure"] @@ -137,29 +147,27 @@ def launch(args: argparse.Namespace): translation_table=args.translation_table, norna=annotate_params.norna, kingdom=annotate_params.kingdom, allow_overlap=annotate_params.allow_overlap, procedure=annotate_params.prodigal_procedure, disable_bar=args.disable_prog_bar ) - - - exit() + # Add input organism in pangenome. This is temporary as the pangenome object is not going to be written. 
- pangenome.add_organism(input_organism) + # pangenome.add_organism(input_organism) - singleton_gene_count = annotate_input_genes_with_pangenome_families(pangenome, input_organism=input_organism, + singleton_gene_count = annotate_input_genes_with_pangenome_families(pangenome, input_organisms=organisms, output=output_dir, cpu=args.cpu, use_representatives=args.fast, no_defrag=args.no_defrag, identity=args.identity, coverage=args.coverage, tmpdir=args.tmpdir, translation_table=args.translation_table, keep_tmp=args.keep_tmp, disable_bar=args.disable_prog_bar) - + + exit() input_org_rgps, input_org_spots, input_org_modules = None, None, None if predict_rgp: - logging.getLogger().info('Detecting rgp in input genome.') + logging.getLogger('PPanGGOLiN').info('Detecting RGPs in input genomes.') - logging.getLogger().info("Detecting multigenic families...") - multigenics = pangenome.get_multigenics( - pangenome_params.rgp.dup_margin) + logging.getLogger('PPanGGOLiN').debug("Detecting multigenic families...") + multigenics = pangenome.get_multigenics(pangenome_params.rgp.dup_margin) - input_org_rgps = predict_RGP(pangenome, input_organism, persistent_penalty=pangenome_params.rgp.persistent_penalty, variable_gain=pangenome_params.rgp.variable_gain, + input_org_rgps = predict_RGP(pangenome, organisms, persistent_penalty=pangenome_params.rgp.persistent_penalty, variable_gain=pangenome_params.rgp.variable_gain, min_length=pangenome_params.rgp.min_length, min_score=pangenome_params.rgp.min_score, multigenics=multigenics, disable_bar=args.disable_prog_bar) if len(input_org_rgps) == 0: @@ -178,6 +186,7 @@ def launch(args: argparse.Namespace): # then no spot will be found input_org_spots = {} else: + logging.getLogger('PPanGGOLiN').info('Detecting RG in input genomes.') input_org_spots = predict_spots_in_input_organism(initial_spots=list(pangenome.spots), initial_regions=pangenome.regions, input_org_rgps=input_org_rgps, @@ -231,7 +240,7 @@ def 
annotate_fasta_files(genome_name_to_fasta_path: Dict[str,dict], tmpdir: str, def read_annotation_files(genome_name_to_annot_path: Dict[str,dict], cpu: int = 1, pseudo: bool = False, - disable_bar: bool = False): + disable_bar: bool = False) -> Tuple[List[Organism], Dict[Organism,bool]]: """ Read the annotation from GBFF file @@ -261,6 +270,18 @@ def read_annotation_files(genome_name_to_annot_path: Dict[str,dict], cpu: int = organisms.append(org) org_to_has_fasta_flag[org] = has_fasta + genes = (gene for org in organisms for gene in org.genes) + + if local_identifiers_are_unique(genes): + for gene in genes: + gene.ID = gene.local_identifier # Erase ppanggolin generated gene ids and replace with local identifiers + gene.local_identifier = "" # this is now useless, setting it to default value + + logging.getLogger("PPanGGOLiN").info("Gene identifiers used in the provided annotation files were unique, " + "PPanGGOLiN will use them.") + else: + logging.getLogger("PPanGGOLiN").info("Gene identifiers used in the provided annotation files were not unique, " + "PPanGGOLiN will use self-generated identifiers.") return organisms, org_to_has_fasta_flag @@ -439,7 +460,7 @@ def summarize_projection(input_organism:Organism, pangenome:Pangenome, input_org print(yaml_string) -def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_organisms: Iter[Organism], output: Path, +def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_organisms: Iterable[Organism], output: Path, cpu: int,use_representatives:bool, no_defrag: bool, identity: float, coverage: float, tmpdir: Path, translation_table: int, keep_tmp:bool = False, disable_bar: bool =False): @@ -461,68 +482,81 @@ def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_org :return: Number of genes that do not cluster with any of the gene families of the pangenome. 
""" + seq_fasta_files = [] + + logging.getLogger('PPanGGOLiN').info(f'Writting gene sequences of input genomes.') for input_organism in input_organisms: - seq_fasta_file = output / f"{input_organism.name}.fasta" - logging.info(f'The input organism has {input_organism.number_of_genes()} genes. Writting them in {seq_fasta_file}') + seq_outdir = output / input_organism.name + mk_outdir(seq_outdir, force=True) + + seq_fasta_file = seq_outdir / f"{input_organism.name}.fasta" with open(seq_fasta_file, "w") as fh_out_faa: - write_gene_sequences_from_annotations( - input_organism.genes, fh_out_faa, disable_bar=True, add="ppanggolin_") - seq_set = {gene.ID if gene.local_identifier == "" else gene.local_identifier for gene in input_organism.genes} + write_gene_sequences_from_annotations(input_organism.genes, fh_out_faa, disable_bar=True, add=f"ppanggolin_") + seq_fasta_files.append(seq_fasta_file) with create_tmpdir(main_dir=tmpdir, basename="align_input_seq_tmpdir", keep_tmp=keep_tmp) as new_tmpdir: if use_representatives: - _, seqid_to_gene_family = get_input_seq_to_family_with_rep(pangenome, seq_fasta_file, output, new_tmpdir, is_input_seq_nt=True, + _, seqid_to_gene_family = get_input_seq_to_family_with_rep(pangenome, seq_fasta_files, output, new_tmpdir, is_input_seq_nt=True, cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, translation_table=translation_table) else: - _, seqid_to_gene_family = get_input_seq_to_family_with_all(pangenome=pangenome, sequence_file=seq_fasta_file, + _, seqid_to_gene_family = get_input_seq_to_family_with_all(pangenome=pangenome, sequence_files=seq_fasta_files, output=output, tmpdir=new_tmpdir, is_input_seq_nt=True, cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, translation_table=translation_table, disable_bar=disable_bar) + for input_organism in input_organisms: + + org_outdir = output / input_organism.name - project_and_write_partition(seqid_to_gene_family, seq_set, output) + seq_set = {gene.ID if 
gene.local_identifier == "" else gene.local_identifier for gene in input_organism.genes} - lonely_gene = 0 - for gene in input_organism.genes: - gene_id = gene.ID if gene.local_identifier == "" else gene.local_identifier + project_and_write_partition(seqid_to_gene_family, seq_set, org_outdir) + + lonely_genes = set() + for gene in input_organism.genes: + gene_id = gene.ID if gene.local_identifier == "" else gene.local_identifier - try: - gene_family = seqid_to_gene_family[gene_id] - gene_family.add(gene) - except KeyError: - # the seqid is not in the dict so it does not align with any pangenome families - # We consider it as cloud gene try: - # in some case a family exists already and has the same name of the gene id - # So gene id cannot be used - _ = pangenome.get_gene_family(gene_id) + gene_family = seqid_to_gene_family[gene_id] + gene_family.add(gene) except KeyError: - new_gene_family = GeneFamily(pangenome.max_fam_id, gene_id) - - else: - # gene id already exists. - new_name=f"{input_organism.name}_{gene_id}" - logging.getLogger('PPanGGOLiN').warning('The input organism as a specific gene that does not align to any ' - f'pangenome families with the same id ({gene_id}) than an existing gene family in the pangenome. 
' - f'The organism name is added to the family name: {new_name}') - new_gene_family = GeneFamily(pangenome.max_fam_id, new_name) - - pangenome.add_gene_family(new_gene_family) - new_gene_family.add(gene) - new_gene_family.partition = "Cloud" - lonely_gene += 1 - - logging.getLogger().info(f"The input organism has {lonely_gene}/{input_organism.number_of_genes()} " - "genes that do not cluster with any of the gene families in the pangenome.") - return lonely_gene - - -def predict_RGP(pangenome: Pangenome, input_organism: Organism, persistent_penalty: int, variable_gain: int, + # the seqid is not in the dict so it does not align with any pangenome families + # We consider it as cloud gene + try: + # in some case a family exists already and has the same name of the gene id + # So gene id cannot be used + _ = pangenome.get_gene_family(gene_id) + except KeyError: + new_gene_family = GeneFamily(pangenome.max_fam_id, gene_id) + + else: + # gene id already exists. + new_name=f"{input_organism.name}_{gene_id}" + logging.getLogger('PPanGGOLiN').warning('The input organism as a specific gene that does not align to any ' + f'pangenome families with the same id ({gene_id}) than an existing gene family in the pangenome. 
' + f'The organism name is added to the family name: {new_name}') + new_gene_family = GeneFamily(pangenome.max_fam_id, new_name) + + pangenome.add_gene_family(new_gene_family) + new_gene_family.add(gene) + new_gene_family.partition = "Cloud" + lonely_genes.add(gene) + + logging.getLogger('PPanGGOLiN').info(f"{input_organism.name} has {len(lonely_genes)}/{input_organism.number_of_genes()} " + "specific genes that do not align to any gene of the pangenome.") + # Write specific gene ids in a file + with open(org_outdir / "specific_genes.tsv", "w") as fl: + fl.write('\n'.join((gene.ID if gene.local_identifier == "" else gene.local_identifier for gene in lonely_genes)) + '\n') + + return len(lonely_genes) + + +def predict_RGP(pangenome: Pangenome, input_organisms: Organism, persistent_penalty: int, variable_gain: int, min_length: int, min_score: int, multigenics: float, disable_bar: bool) -> None: """ @@ -540,13 +574,13 @@ def predict_RGP(pangenome: Pangenome, input_organism: Organism, persistent_penal :return: Set of RGPs """ - logging.getLogger().info("Computing Regions of Genomic Plasticity...") + logging.getLogger('PPanGGOLiN').info("Computing Regions of Genomic Plasticity...") name_scheme = naming_scheme(pangenome) rgps = compute_org_rgp(input_organism, multigenics, persistent_penalty, variable_gain, min_length, min_score, naming=name_scheme, disable_bar=disable_bar) - logging.getLogger().info( + logging.getLogger('PPanGGOLiN').info( f"{len(rgps)} RGPs have been predicted in the input genomes.") return rgps @@ -915,10 +949,10 @@ def project_and_write_modules(pangenome: Pangenome, input_organism: Organism, fout.write( f"module_{mod.ID}\t{input_organism.name}\t{completion}\n") - logging.getLogger().info( + logging.getLogger('PPanGGOLiN').info( f"{counter} modules have been projected to the input genomes.") - logging.getLogger().info( + logging.getLogger('PPanGGOLiN').info( f"Projected modules have been written in: '{output_file}'") return modules_in_input_org From 
3c5de703b7385a2a4d454e10a2c4bca696c39568 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Fri, 15 Sep 2023 10:05:43 +0200 Subject: [PATCH 106/173] predict RGPs in loop on multiple orga --- ppanggolin/RGP/genomicIsland.py | 8 +++--- ppanggolin/projection/projection.py | 43 +++++++++++++++++++---------- 2 files changed, 32 insertions(+), 19 deletions(-) diff --git a/ppanggolin/RGP/genomicIsland.py b/ppanggolin/RGP/genomicIsland.py index 4f5cfc0e..ad6d726b 100644 --- a/ppanggolin/RGP/genomicIsland.py +++ b/ppanggolin/RGP/genomicIsland.py @@ -5,7 +5,7 @@ import logging import argparse from pathlib import Path -from typing import Set +from typing import Set, Iterable # installed libraries from tqdm import tqdm @@ -226,15 +226,15 @@ def compute_org_rgp( organism: Organism, multigenics: set, return org_regions -def naming_scheme(pangenome: Pangenome) -> str: +def naming_scheme(organisms: Iterable[Organism]) -> str: """ Determine the naming scheme for the contigs in the pangenome. - :param pangenome: Pangenome object + :param organisms: Iterable of organims objects :return: Naming scheme for the contigs ("contig" or "organism"). 
""" contigsids = set() - for org in pangenome.organisms: + for org in organisms: for contig in org.contigs: oldlen = len(contigsids) contigsids.add(contig.name) diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index 3bdc44b3..b6803321 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -12,6 +12,7 @@ from typing import Tuple, Set, Dict, Iterator, Optional, List, Iterable from collections import defaultdict import csv +from itertools import chain # installed libraries @@ -158,7 +159,7 @@ def launch(args: argparse.Namespace): translation_table=args.translation_table, keep_tmp=args.keep_tmp, disable_bar=args.disable_prog_bar) - exit() + input_org_rgps, input_org_spots, input_org_modules = None, None, None if predict_rgp: @@ -167,11 +168,13 @@ def launch(args: argparse.Namespace): logging.getLogger('PPanGGOLiN').debug("Detecting multigenic families...") multigenics = pangenome.get_multigenics(pangenome_params.rgp.dup_margin) - input_org_rgps = predict_RGP(pangenome, organisms, persistent_penalty=pangenome_params.rgp.persistent_penalty, variable_gain=pangenome_params.rgp.variable_gain, - min_length=pangenome_params.rgp.min_length, min_score=pangenome_params.rgp.min_score, multigenics=multigenics, + input_org_2_rgps = predict_RGP(pangenome, organisms, persistent_penalty=pangenome_params.rgp.persistent_penalty, variable_gain=pangenome_params.rgp.variable_gain, + min_length=pangenome_params.rgp.min_length, min_score=pangenome_params.rgp.min_score, multigenics=multigenics, output_dir=output_dir, disable_bar=args.disable_prog_bar) - if len(input_org_rgps) == 0: - + + exit() + + if None: logging.getLogger('PPanGGOLiN').info("No RGPs have been found in the input organisms. 
" "As a result, spot prediction and RGP output will be skipped.") @@ -491,7 +494,7 @@ def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_org seq_outdir = output / input_organism.name mk_outdir(seq_outdir, force=True) - seq_fasta_file = seq_outdir / f"{input_organism.name}.fasta" + seq_fasta_file = seq_outdir / f"cds_sequences.fasta" with open(seq_fasta_file, "w") as fh_out_faa: write_gene_sequences_from_annotations(input_organism.genes, fh_out_faa, disable_bar=True, add=f"ppanggolin_") @@ -558,31 +561,41 @@ def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_org def predict_RGP(pangenome: Pangenome, input_organisms: Organism, persistent_penalty: int, variable_gain: int, min_length: int, min_score: int, multigenics: float, - disable_bar: bool) -> None: + output_dir:Path, disable_bar: bool) -> Dict[Organism, Set[Region]]: """ - Compute Regions of Genomic Plasticity (RGP) for the given pangenome and input organism. + Compute Regions of Genomic Plasticity (RGP) for the given input organisms. :param pangenome: The pangenome object. - :param input_organism: The input organism for which to compute RGPs. + :param input_organisms: The input organism for which to compute RGPs. :param persistent_penalty: Penalty score to apply to persistent genes. :param variable_gain: Gain score to apply to variable genes. :param min_length: Minimum length (bp) of a region to be considered as RGP. :param min_score: Minimal score required for considering a region as RGP. :param multigenics: multigenic families. + :param output_dir: Output directory where predicted rgps are going to be written. :param disable_bar: Flag to disable the progress bar. 
- :return: Set of RGPs + :return: Dictionary mapping organism with the set of predicted regions """ logging.getLogger('PPanGGOLiN').info("Computing Regions of Genomic Plasticity...") - name_scheme = naming_scheme(pangenome) - rgps = compute_org_rgp(input_organism, multigenics, persistent_penalty, variable_gain, min_length, + name_scheme = naming_scheme(chain(pangenome.organisms, input_organisms)) + organism_to_rgps = {} + + for input_organism in input_organisms: + rgps = compute_org_rgp(input_organism, multigenics, persistent_penalty, variable_gain, min_length, min_score, naming=name_scheme, disable_bar=disable_bar) - logging.getLogger('PPanGGOLiN').info( - f"{len(rgps)} RGPs have been predicted in the input genomes.") - return rgps + logging.getLogger('PPanGGOLiN').info(f"{len(rgps)} RGPs have been predicted in the input genomes.") + + + org_outdir = output_dir / input_organism.name + + write_predicted_regions(rgps, output=org_outdir, compress=False) + organism_to_rgps[input_organism] = rgps + + return organism_to_rgps def write_predicted_regions(regions: Set[Region], From 67d80214d3da87d342cbdef32c79af96495d41b5 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Fri, 15 Sep 2023 11:51:20 +0200 Subject: [PATCH 107/173] make spot prediction compatible with multiple organism --- ppanggolin/projection/projection.py | 94 +++++++++++++++++------------ 1 file changed, 55 insertions(+), 39 deletions(-) diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index b6803321..a9d17415 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -171,34 +171,22 @@ def launch(args: argparse.Namespace): input_org_2_rgps = predict_RGP(pangenome, organisms, persistent_penalty=pangenome_params.rgp.persistent_penalty, variable_gain=pangenome_params.rgp.variable_gain, min_length=pangenome_params.rgp.min_length, min_score=pangenome_params.rgp.min_score, multigenics=multigenics, output_dir=output_dir, 
disable_bar=args.disable_prog_bar) - - exit() - - if None: - logging.getLogger('PPanGGOLiN').info("No RGPs have been found in the input organisms. " - "As a result, spot prediction and RGP output will be skipped.") - - else: - write_predicted_regions( - input_org_rgps, output=output_dir, compress=False) - - if predict_rgp and project_spots: - if len(input_org_rgps) == 0: - # if rgp and spot flag are on but no RGP has been found - # then no spot will be found - input_org_spots = {} - else: - logging.getLogger('PPanGGOLiN').info('Detecting RG in input genomes.') - input_org_spots = predict_spots_in_input_organism(initial_spots=list(pangenome.spots), - initial_regions=pangenome.regions, - input_org_rgps=input_org_rgps, - multigenics=multigenics, output=output_dir, - write_graph_flag=args.spot_graph, graph_formats=args.graph_formats, - overlapping_match=pangenome_params.spot.overlapping_match, - set_size=pangenome_params.spot.set_size, - exact_match=pangenome_params.spot.exact_match_size) + if project_spots: + logging.getLogger('PPanGGOLiN').info('Predicting spot of insertion in input genomes.') + input_org_spots = predict_spots_in_input_organisms(initial_spots=list(pangenome.spots), + initial_regions=pangenome.regions, + input_org_2_rgps=input_org_2_rgps, + multigenics=multigenics, + output=output_dir, + write_graph_flag=args.spot_graph, + graph_formats=args.graph_formats, + overlapping_match=pangenome_params.spot.overlapping_match, + set_size=pangenome_params.spot.set_size, + exact_match=pangenome_params.spot.exact_match_size) + + exit() if project_modules: input_org_modules = project_and_write_modules(pangenome, input_organism, output_dir) @@ -642,7 +630,7 @@ def write_rgp_to_spot_table(rgp_to_spots: Dict[Region, Set[str]], output: Path, :param compress: Whether to compress the file. 
""" fname = output / filename - logging.getLogger('PPanGGOLiN').info( + logging.getLogger('PPanGGOLiN').debug( f'Writing RGPs to spot table in {fname}') with write_compressed_or_not(fname, compress) as tab: @@ -783,8 +771,8 @@ def check_spots_congruency(graph_spot: nx.Graph, spots: List[Spot]) -> None: graph_spot.nodes[node]["spots"] = {current_spot} -def predict_spots_in_input_organism(initial_spots: List[Spot], initial_regions: List[Region], - input_org_rgps: List[Region], multigenics: Set[GeneFamily], output: str, +def predict_spots_in_input_organisms(initial_spots: List[Spot], initial_regions: List[Region], + input_org_2_rgps: Dict[Organism, Set[Region]], multigenics: Set[GeneFamily], output: str, write_graph_flag: bool = False, graph_formats: List[str] = ['gexf'], overlapping_match: int = 2, set_size: int = 3, exact_match: int = 1) -> Dict: """ @@ -804,7 +792,7 @@ def predict_spots_in_input_organism(initial_spots: List[Spot], initial_regions: :return: A dictionary mapping input organism RGPs to their predicted spots. """ - logging.getLogger("PPanGGOLiN").info(f"Rebuilding spot graph.") + logging.getLogger("PPanGGOLiN").debug(f"Rebuilding original spot graph.") graph_spot = make_spot_graph(rgps=initial_regions, multigenics=multigenics, overlapping_match=overlapping_match, set_size=set_size, exact_match=exact_match) @@ -812,7 +800,36 @@ def predict_spots_in_input_organism(initial_spots: List[Spot], initial_regions: # Check congruency with already computed spot and add spot id in node attributes check_spots_congruency(graph_spot, initial_spots) + + new_spot_id_counter = max((s.ID for s in initial_spots)) + 1 + for input_organism, rgps in input_org_2_rgps.items(): + + if len(rgps) == 0: + logging.getLogger('PPanGGOLiN').info(f"{input_organism.name}: No RGPs have been found. 
" + "As a result, spot prediction and RGP output will be skipped.") + continue + + outdir_org = output / input_organism.name + # Copy the graph spot, as each input organism are processed independently + graph_spot_cp = graph_spot.copy() + + input_org_spots = predict_spot_in_one_organism(graph_spot_cp, input_org_rgps=rgps, original_nodes=original_nodes, + new_spot_id_counter=new_spot_id_counter, multigenics=multigenics, organism_name=input_organism.name, + output=outdir_org, write_graph_flag=write_graph_flag, graph_formats=graph_formats, + overlapping_match=overlapping_match, set_size=set_size, exact_match=exact_match) + + new_spot_id_counter = max((s.ID for s in input_org_spots)) + 1 + + + return input_org_spots + +def predict_spot_in_one_organism(graph_spot, input_org_rgps, original_nodes, new_spot_id_counter, multigenics: Set[GeneFamily], + organism_name:str, output: Path, + write_graph_flag: bool = False, graph_formats: List[str] = ['gexf'], + overlapping_match: int = 2, set_size: int = 3, exact_match: int = 1): + + # Check which input RGP has a spot lost = 0 used = 0 @@ -829,7 +846,7 @@ def predict_spots_in_input_organism(initial_spots: List[Spot], initial_regions: input_org_node_to_rgps[border_node].add(rgp) if len(input_org_node_to_rgps) == 0: - logging.getLogger("PPanGGOLiN").info(f"No RGPs of the input organism will be associated with any spot of insertion " + logging.getLogger("PPanGGOLiN").info(f"{organism_name}: no RGPs of the input organism will be associated with any spot of insertion " "as they are on a contig border (or have " f"less than {set_size} persistent gene families until the contig border). 
" "Projection of spots stops here") @@ -838,12 +855,11 @@ def predict_spots_in_input_organism(initial_spots: List[Spot], initial_regions: # remove node that were already in the graph new_nodes = set(input_org_node_to_rgps) - original_nodes - logging.getLogger("PPanGGOLiN").info(f"{lost} RGPs of the input organism won't be associated with any spot of insertion " - "as they are on a contig border (or have " + logging.getLogger("PPanGGOLiN").info(f"{organism_name}: {lost} RGPs were not used as they are on a contig border (or have" f"less than {set_size} persistent gene families until the contig border)") logging.getLogger("PPanGGOLiN").info( - f"{used} RGPs of the input organism will be associated to a spot of insertion") + f"{organism_name}: {used} RGPs of the input organism will be associated to a spot of insertion") # add potential edges from new nodes to the rest of the nodes all_nodes = list(graph_spot.nodes) @@ -860,7 +876,7 @@ def predict_spots_in_input_organism(initial_spots: List[Spot], initial_regions: input_rgp_to_spots = {} new_spots = [] - new_spot_id_counter = max((s.ID for s in initial_spots)) + 1 + # determine spot ids of the new nodes and by extension to their rgps for comp in nx.algorithms.components.connected_components(graph_spot): # in very rare case one cc can have several original spots @@ -883,7 +899,7 @@ def predict_spots_in_input_organism(initial_spots: List[Spot], initial_regions: elif len(spots_of_the_cc) > 1: # more than one spot in the cc - logging.getLogger("PPanGGOLiN").info('Some RGPs of the input organism ' + logging.getLogger("PPanGGOLiN").info(f'{organism_name}: Some RGPs of the input organism ' f"are connected to {len(spots_of_the_cc)} original spots of the pangenome.") input_rgps_of_the_cc = set() @@ -906,6 +922,7 @@ def predict_spots_in_input_organism(initial_spots: List[Spot], initial_regions: {rgp: spots_of_the_cc for rgp in input_rgps_of_the_cc}) if write_graph_flag: + # remove node that would not be writable in graph file for 
node in graph_spot.nodes: del graph_spot.nodes[node]["spots"] @@ -921,15 +938,14 @@ def predict_spots_in_input_organism(initial_spots: List[Spot], initial_regions: logging.getLogger('PPanGGOLiN').info( - f'{len(new_spots)} new spots have been created for the input genome.') + f'{organism_name}: {len(new_spots)} new spots have been created for the input genome.') if new_spots: summarize_spots(new_spots, output, compress=False, file_name="new_spots_summary.tsv") - + return input_org_spots - def project_and_write_modules(pangenome: Pangenome, input_organism: Organism, output: Path, compress: bool = False): """ From 3443ee39c9da618ccb88e3730b2fe0b4b03ac924 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Fri, 15 Sep 2023 11:57:48 +0200 Subject: [PATCH 108/173] project modules in multiple organisms --- ppanggolin/projection/projection.py | 52 ++++++++++++++--------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index a9d17415..409e6fe3 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -186,11 +186,10 @@ def launch(args: argparse.Namespace): set_size=pangenome_params.spot.set_size, exact_match=pangenome_params.spot.exact_match_size) - exit() if project_modules: - input_org_modules = project_and_write_modules(pangenome, input_organism, output_dir) + input_org_modules = project_and_write_modules(pangenome, organisms, output_dir) - summarize_projection(input_organism, pangenome, input_org_rgps, input_org_spots, input_org_modules, singleton_gene_count ) + # summarize_projection(input_organism, pangenome, input_org_rgps, input_org_spots, input_org_modules, singleton_gene_count ) def annotate_fasta_files(genome_name_to_fasta_path: Dict[str,dict], tmpdir: str, cpu: int = 1, translation_table: int = 11, kingdom: str = "bacteria", norna: bool = False, allow_overlap: bool = False, procedure: str = None, @@ -946,44 +945,45 @@ def 
predict_spot_in_one_organism(graph_spot, input_org_rgps, original_nodes, new return input_org_spots -def project_and_write_modules(pangenome: Pangenome, input_organism: Organism, +def project_and_write_modules(pangenome: Pangenome, input_organisms: Iterable[Organism], output: Path, compress: bool = False): """ Write a tsv file providing association between modules and the input organism :param pangenome: Pangenome object - :param input_organism: the organism that is being annotated + :param input_organisms: iterable of the organisms that is being annotated :param output: Path to output directory :param compress: Compress the file in .gz """ - output_file = output / "modules_in_input_organism.tsv" + for input_organism in input_organisms: + output_file = output / input_organism.name / "modules_in_input_organism.tsv" - input_organism_families = list(input_organism.families) - counter = 0 - modules_in_input_org = [] - with write_compressed_or_not(output_file, compress) as fout: - fout.write("module_id\torganism\tcompletion\n") + input_organism_families = list(input_organism.families) + counter = 0 + modules_in_input_org = [] + with write_compressed_or_not(output_file, compress) as fout: + fout.write("module_id\torganism\tcompletion\n") - for mod in pangenome.modules: - module_in_input_organism = any( - (fam in input_organism_families for fam in mod.families)) + for mod in pangenome.modules: + module_in_input_organism = any( + (fam in input_organism_families for fam in mod.families)) - if module_in_input_organism: - counter += 1 - modules_in_input_org.append(mod) + if module_in_input_organism: + counter += 1 + modules_in_input_org.append(mod) - completion = round( - len(set(input_organism.families) & set(mod.families)) / len(set(mod.families)), 2) - fout.write( - f"module_{mod.ID}\t{input_organism.name}\t{completion}\n") + completion = round( + len(set(input_organism.families) & set(mod.families)) / len(set(mod.families)), 2) + fout.write( + 
f"module_{mod.ID}\t{input_organism.name}\t{completion}\n") - logging.getLogger('PPanGGOLiN').info( - f"{counter} modules have been projected to the input genomes.") + logging.getLogger('PPanGGOLiN').info( + f"{input_organism.name}: {counter} modules have been projected to the input genomes.") - logging.getLogger('PPanGGOLiN').info( - f"Projected modules have been written in: '{output_file}'") - + logging.getLogger('PPanGGOLiN').info( + f"{input_organism.name}: Projected modules have been written in: '{output_file}'") + return modules_in_input_org def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: From 9f8f65646e00d0ae7ef66d2ff131ed176bb3855d Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Fri, 15 Sep 2023 12:12:35 +0200 Subject: [PATCH 109/173] produce summary yaml for each organism --- ppanggolin/projection/projection.py | 44 ++++++++++++++++++----------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index 409e6fe3..37ecc32e 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -152,7 +152,7 @@ def launch(args: argparse.Namespace): # Add input organism in pangenome. This is temporary as the pangenome object is not going to be written. 
# pangenome.add_organism(input_organism) - singleton_gene_count = annotate_input_genes_with_pangenome_families(pangenome, input_organisms=organisms, + input_org_to_lonely_genes_count = annotate_input_genes_with_pangenome_families(pangenome, input_organisms=organisms, output=output_dir, cpu=args.cpu, use_representatives=args.fast, no_defrag=args.no_defrag, identity=args.identity, coverage=args.coverage, tmpdir=args.tmpdir, @@ -160,7 +160,7 @@ def launch(args: argparse.Namespace): disable_bar=args.disable_prog_bar) - input_org_rgps, input_org_spots, input_org_modules = None, None, None + input_org_2_rgps, input_org_to_spots, input_orgs_to_modules = None, None, None if predict_rgp: logging.getLogger('PPanGGOLiN').info('Detecting RGPs in input genomes.') @@ -175,7 +175,7 @@ def launch(args: argparse.Namespace): if project_spots: logging.getLogger('PPanGGOLiN').info('Predicting spot of insertion in input genomes.') - input_org_spots = predict_spots_in_input_organisms(initial_spots=list(pangenome.spots), + input_org_to_spots = predict_spots_in_input_organisms(initial_spots=list(pangenome.spots), initial_regions=pangenome.regions, input_org_2_rgps=input_org_2_rgps, multigenics=multigenics, @@ -187,9 +187,13 @@ def launch(args: argparse.Namespace): exact_match=pangenome_params.spot.exact_match_size) if project_modules: - input_org_modules = project_and_write_modules(pangenome, organisms, output_dir) + input_orgs_to_modules = project_and_write_modules(pangenome, organisms, output_dir) - # summarize_projection(input_organism, pangenome, input_org_rgps, input_org_spots, input_org_modules, singleton_gene_count ) + for organism in organisms: + + summarize_projection(organism, pangenome, input_org_2_rgps[organism], + input_org_to_spots[organism], input_orgs_to_modules[organism], + input_org_to_lonely_genes_count[organism], output_dir) def annotate_fasta_files(genome_name_to_fasta_path: Dict[str,dict], tmpdir: str, cpu: int = 1, translation_table: int = 11, kingdom: str = 
"bacteria", norna: bool = False, allow_overlap: bool = False, procedure: str = None, @@ -397,7 +401,7 @@ def parse_input_paths_file(path_list_file): def summarize_projection(input_organism:Organism, pangenome:Pangenome, input_org_rgps:Region, - input_org_spots:Spot, input_org_modules:Module, singleton_gene_count:int ): + input_org_spots:Spot, input_org_modules:Module, singleton_gene_count:int, output_dir:Path): """ :param singleton_gene_count: Number of genes that do not cluster with any of the gene families of the pangenome. @@ -443,11 +447,9 @@ def summarize_projection(input_organism:Organism, pangenome:Pangenome, input_org "Modules": module_count } yaml_string = yaml.dump(summary_info, default_flow_style=False, sort_keys=False, indent=4) - - - # summary_str = '\n'.join((f' - {k}: {v}' for k,v in summary_info )) - print('Projection_summary:') - print(yaml_string) + with open(output_dir / input_organism.name / "projection_summary.yaml", 'w') as flout: + flout.write('Projection_summary:') + flout.write(yaml_string) def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_organisms: Iterable[Organism], output: Path, @@ -498,7 +500,7 @@ def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_org output=output, tmpdir=new_tmpdir, is_input_seq_nt=True, cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, translation_table=translation_table, disable_bar=disable_bar) - + input_org_to_lonely_genes_count = {} for input_organism in input_organisms: org_outdir = output / input_organism.name @@ -543,7 +545,9 @@ def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_org with open(org_outdir / "specific_genes.tsv", "w") as fl: fl.write('\n'.join((gene.ID if gene.local_identifier == "" else gene.local_identifier for gene in lonely_genes)) + '\n') - return len(lonely_genes) + input_org_to_lonely_genes_count[input_organism] = len(lonely_genes) + + return input_org_to_lonely_genes_count def 
predict_RGP(pangenome: Pangenome, input_organisms: Organism, persistent_penalty: int, variable_gain: int, @@ -802,11 +806,14 @@ def predict_spots_in_input_organisms(initial_spots: List[Spot], initial_regions: new_spot_id_counter = max((s.ID for s in initial_spots)) + 1 + input_org_to_spots = {} for input_organism, rgps in input_org_2_rgps.items(): if len(rgps) == 0: logging.getLogger('PPanGGOLiN').info(f"{input_organism.name}: No RGPs have been found. " "As a result, spot prediction and RGP output will be skipped.") + + input_org_to_spots[input_organism] = set() continue outdir_org = output / input_organism.name @@ -819,9 +826,10 @@ def predict_spots_in_input_organisms(initial_spots: List[Spot], initial_regions: overlapping_match=overlapping_match, set_size=set_size, exact_match=exact_match) new_spot_id_counter = max((s.ID for s in input_org_spots)) + 1 - + + input_org_to_spots[input_organism] = input_org_spots - return input_org_spots + return input_org_to_spots def predict_spot_in_one_organism(graph_spot, input_org_rgps, original_nodes, new_spot_id_counter, multigenics: Set[GeneFamily], organism_name:str, output: Path, @@ -955,7 +963,7 @@ def project_and_write_modules(pangenome: Pangenome, input_organisms: Iterable[Or :param output: Path to output directory :param compress: Compress the file in .gz """ - + input_orgs_to_modules = {} for input_organism in input_organisms: output_file = output / input_organism.name / "modules_in_input_organism.tsv" @@ -984,7 +992,9 @@ def project_and_write_modules(pangenome: Pangenome, input_organisms: Iterable[Or logging.getLogger('PPanGGOLiN').info( f"{input_organism.name}: Projected modules have been written in: '{output_file}'") - return modules_in_input_org + input_orgs_to_modules[input_organism] = modules_in_input_org + + return input_orgs_to_modules def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: """ From d4a7316ba6e4f619c725a5fef245a18dc9a08bc1 Mon Sep 17 00:00:00 2001 From: JeanMainguy 
Date: Fri, 15 Sep 2023 15:21:43 +0200 Subject: [PATCH 110/173] add global summary gathering all organisms --- ppanggolin/projection/projection.py | 57 ++++++++++++++++++++++++----- 1 file changed, 48 insertions(+), 9 deletions(-) diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index 37ecc32e..a952ae52 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -9,7 +9,7 @@ import time from pathlib import Path import tempfile -from typing import Tuple, Set, Dict, Iterator, Optional, List, Iterable +from typing import Tuple, Set, Dict, Iterator, Optional, List, Iterable, Any from collections import defaultdict import csv from itertools import chain @@ -19,6 +19,8 @@ from tqdm import tqdm import networkx as nx import yaml +import pandas as pd + # # local libraries from ppanggolin.annotate.synta import annotate_organism, read_fasta, get_dna_sequence @@ -160,7 +162,7 @@ def launch(args: argparse.Namespace): disable_bar=args.disable_prog_bar) - input_org_2_rgps, input_org_to_spots, input_orgs_to_modules = None, None, None + input_org_2_rgps, input_org_to_spots, input_orgs_to_modules = {}, {}, {} if predict_rgp: logging.getLogger('PPanGGOLiN').info('Detecting RGPs in input genomes.') @@ -189,11 +191,17 @@ def launch(args: argparse.Namespace): if project_modules: input_orgs_to_modules = project_and_write_modules(pangenome, organisms, output_dir) + organism_2_summary = {} for organism in organisms: - - summarize_projection(organism, pangenome, input_org_2_rgps[organism], - input_org_to_spots[organism], input_orgs_to_modules[organism], + # summarize projection for all input organisms + organism_2_summary[organism] = summarize_projection(organism, pangenome, + input_org_2_rgps.get(organism, None), + input_org_to_spots.get(organism, None), + input_orgs_to_modules.get(organism, None), input_org_to_lonely_genes_count[organism], output_dir) + + write_summaries(organism_2_summary, output_dir) + def 
annotate_fasta_files(genome_name_to_fasta_path: Dict[str,dict], tmpdir: str, cpu: int = 1, translation_table: int = 11, kingdom: str = "bacteria", norna: bool = False, allow_overlap: bool = False, procedure: str = None, @@ -399,6 +407,40 @@ def parse_input_paths_file(path_list_file): return genome_name_to_genome_path +def write_summaries(organism_2_summary: Dict[Organism, Dict[str, Any]], output_dir: Path): + """ + Write summary information to YAML files and create a summary projection in TSV format. + + This function takes a dictionary where keys are input organisms and values are dictionaries containing summary + information. It writes this information to YAML files for each organism and creates a summary projection in TSV format. + + :param organism_2_summary: A dictionary where keys are input organisms and values are dictionaries containing + summary information. + :param output_dir: The directory where the summary files will be written. + """ + flat_summaries = [] + + for input_organism, summary_info in organism_2_summary.items(): + yaml_string = yaml.dump(summary_info, default_flow_style=False, sort_keys=False, indent=4) + + with open(output_dir / input_organism.name / "projection_summary.yaml", 'w') as flout: + flout.write('Projection_summary:') + flout.write(yaml_string) + + flat_summary = {} + for key, val in summary_info.items(): + if type(val) == dict: + for nest_k, nest_v in val.items(): + flat_summary[f"{key} {nest_k}"] = nest_v + else: + flat_summary[key] = val + + flat_summaries.append(flat_summary) + + df_summary = pd.DataFrame(flat_summaries) + + df_summary.to_csv(output_dir / "summary_projection.tsv", sep='\t', index=False) + def summarize_projection(input_organism:Organism, pangenome:Pangenome, input_org_rgps:Region, input_org_spots:Spot, input_org_modules:Module, singleton_gene_count:int, output_dir:Path): @@ -446,10 +488,7 @@ def summarize_projection(input_organism:Organism, pangenome:Pangenome, input_org "New spots": new_spot_count, "Modules": 
module_count } - yaml_string = yaml.dump(summary_info, default_flow_style=False, sort_keys=False, indent=4) - with open(output_dir / input_organism.name / "projection_summary.yaml", 'w') as flout: - flout.write('Projection_summary:') - flout.write(yaml_string) + return summary_info def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_organisms: Iterable[Organism], output: Path, From 45ba8cbc9010528fbe5ca951b634636e64374dbc Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Fri, 15 Sep 2023 17:04:03 +0200 Subject: [PATCH 111/173] enable single genome input --- ppanggolin/main.py | 17 +- ppanggolin/projection/projection.py | 240 ++++++++++++++++++++-------- 2 files changed, 179 insertions(+), 78 deletions(-) diff --git a/ppanggolin/main.py b/ppanggolin/main.py index 8024355e..017cdadd 100644 --- a/ppanggolin/main.py +++ b/ppanggolin/main.py @@ -15,7 +15,6 @@ from ppanggolin.utils import check_input_files, set_verbosity_level, add_common_arguments, manage_cli_and_config_args import ppanggolin.nem.partition import ppanggolin.nem.rarefaction -import ppanggolin.nem.partition import ppanggolin.graph import ppanggolin.annotate import ppanggolin.cluster @@ -143,19 +142,9 @@ def cmd_line() -> argparse.Namespace: if args.subcommand == "align" and args.sequences is None: parser.error("Please provide sequences (nucleotides or amino acids) for alignment with the pangenome gene families " "using the --sequences argument, either through the command line or the config file.") - - - # if args.subcommand == "projection" and args.organism_name is None: - # parser.error("Please specify the name of the input organism you want to annotate using the provided pangenome. 
" - # "You can use the --organism_name argument either through the command line or the config file.") - - # if args.subcommand == "projection" and args.fasta_file is None and args.annot_file is None: - # parser.error("Please provide either a sequence file using the --fasta_file option or an annotation file (GFF/GBFF) " - # "using the --annot_file option for the input organism, either through the command line or the config file, " - # "to enable annotation with the provided pangenome.") - - - + + if args.subcommand == "projection": + ppanggolin.projection.projection.check_projection_arguments(args, parser) return args diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index a952ae52..1e360b9f 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -50,7 +50,6 @@ class NewSpot(Spot): def __str__(self): return f'new_spot_{str(self.ID)}' - def launch(args: argparse.Namespace): """ Command launcher @@ -66,13 +65,6 @@ def launch(args: argparse.Namespace): predict_rgp = True project_spots = True - if hasattr(args, "fasta") and args.fasta is not None: - check_input_files(args.fasta, True) - - if hasattr(args, "anno") and args.anno is not None: - check_input_files(args.anno, True) - - pangenome = Pangenome() pangenome.add_file(args.pangenome) @@ -117,35 +109,57 @@ def launch(args: argparse.Namespace): pangenome_params = argparse.Namespace( **{step: argparse.Namespace(**k_v) for step, k_v in pangenome.parameters.items()}) - - if args.anno is not None: + + if args.anno: genome_name_to_annot_path = parse_input_paths_file(args.anno) - check_input_names(pangenome, genome_name_to_annot_path, args.anno) + + elif args.single_annot_file: + circular_contigs = args.circular_contigs if args.circular_contigs else [] + genome_name_to_annot_path = {args.organism_name: {"path": args.single_annot_file, + "circular_contigs": circular_contigs} + } + else: + genome_name_to_annot_path = None + + if args.fasta: + 
genome_name_to_fasta_path = parse_input_paths_file(args.fasta) + + elif args.single_fasta_file: + circular_contigs = args.circular_contigs if args.circular_contigs else [] + genome_name_to_fasta_path = {args.organism_name: {"path": args.single_fasta_file, + "circular_contigs": circular_contigs} + } + else: + genome_name_to_fasta_path = None + + + if genome_name_to_annot_path: + check_input_names(pangenome, genome_name_to_annot_path) organisms, org_2_has_fasta = read_annotation_files(genome_name_to_annot_path, cpu=args.cpu, pseudo=args.use_pseudo, disable_bar=args.disable_prog_bar) if not all((has_fasta for has_fasta in org_2_has_fasta.values())): organisms_with_no_fasta = {org for org, has_fasta in org_2_has_fasta.items() if not has_fasta} - if args.fasta is not None: - get_gene_sequences_from_fasta_files(organisms_with_no_fasta, args.fasta) + if args.fasta: + get_gene_sequences_from_fasta_files(organisms_with_no_fasta, genome_name_to_fasta_path) else: raise ValueError(f"You provided GFF files for {len(organisms_with_no_fasta)} (out of {len(organisms)}) " "organisms without associated sequence data, and you did not provide " - "FASTA sequences using the --fasta option. Therefore, it is impossible to project the pangenome onto the input genomes. " + "FASTA sequences using the --fasta or --single_fasta_file options. Therefore, it is impossible to project the pangenome onto the input genomes. 
" f"The following organisms have no associated sequence data: {', '.join(o.name for o in organisms_with_no_fasta)}") - elif args.fasta is not None: + elif genome_name_to_fasta_path: annotate_param_names = ["norna", "kingdom", "allow_overlap", "prodigal_procedure"] annotate_params = manage_annotate_param(annotate_param_names, pangenome_params.annotate, args.config) - genome_name_to_fasta_path = parse_input_paths_file(args.fasta) - check_input_names(pangenome, genome_name_to_fasta_path, args.fasta) + + check_input_names(pangenome, genome_name_to_fasta_path) organisms = annotate_fasta_files(genome_name_to_fasta_path=genome_name_to_fasta_path, tmpdir=args.tmpdir, cpu=args.cpu, translation_table=args.translation_table, norna=annotate_params.norna, kingdom=annotate_params.kingdom, allow_overlap=annotate_params.allow_overlap, procedure=annotate_params.prodigal_procedure, disable_bar=args.disable_prog_bar ) @@ -329,21 +343,20 @@ def read_annotation_files(genome_name_to_annot_path: Dict[str,dict], cpu: int = -def get_gene_sequences_from_fasta_files(organisms, fasta_paths_file): +def get_gene_sequences_from_fasta_files(organisms, genome_name_to_annot_path): """ Get gene sequences from fasta path file :param organisms: input pangenome :param fasta_file: list of fasta file """ - genome_name_to_annot_path = parse_input_paths_file(fasta_paths_file) org_names = {org.name for org in organisms} if org_names & set(genome_name_to_annot_path) != org_names: missing = len(org_names - set(genome_name_to_annot_path)) - raise ValueError(f"Not all of your pangenome organisms are present within the provided fasta file: {fasta_paths_file}. " - f"{missing} are missing (out of {len(organisms)}).") + raise ValueError(f"You did not provided fasta for all the organisms found in annotation file. " + f"{missing} are missing (out of {len(organisms)}). 
Missing organisms: {','.join(missing)}") for org in organisms: @@ -369,29 +382,43 @@ def get_gene_sequences_from_fasta_files(organisms, fasta_paths_file): rna.add_sequence(get_dna_sequence(contig_seq, rna)) +def check_input_names(pangenome, input_names): + """ + Check if input organism names already exist in the pangenome. -def check_input_names(pangenome, input_names, path_list_file): - duplicated_names = set(input_names) & {org.name for org in pangenome.organisms} - if len(duplicated_names) != 0: - raise NameError(f"{len(duplicated_names)} organism names found in '{path_list_file}' already exists in the given pangenome: {' '.join(duplicated_names)}") - + :param pangenome: The pangenome object. + :param input_names: List of input organism names to check. + :raises NameError: If duplicate organism names are found in the pangenome. + """ + duplicated_names = set(input_names) & {org.name for org in pangenome.organisms} + if len(duplicated_names) != 0: + raise NameError(f"{len(duplicated_names)} provided organism names already exist in the given pangenome: {' '.join(duplicated_names)}") -def parse_input_paths_file(path_list_file): +def parse_input_paths_file(path_list_file: Path) -> Dict[str, Dict[str, List[str]]]: + """ + Parse an input paths file to extract genome information. + This function reads an input paths file, which is in TSV format, and extracts genome information + including file paths and putative circular contigs. + :param path_list_file: The path to the input paths file. + :return: A dictionary where keys are genome names and values are dictionaries containing path information and + putative circular contigs. + :raises FileNotFoundError: If a specified genome file path does not exist. + :raises Exception: If there are no genomes in the provided file. 
+ """ logging.getLogger("PPanGGOLiN").info(f"Reading {path_list_file} to process organism files") genome_name_to_genome_path = {} - for line in read_compressed_or_not(path_list_file): - elements = [el.strip() for el in line.split("\t")] genome_file_path = Path(elements[1]) genome_name = elements[0] putative_circular_contigs = elements[2:] - if not genome_file_path.exists(): # Check tsv sanity test if it's not one it's the other + if not genome_file_path.exists(): + # Check if the file path doesn't exist and try an alternative path. genome_file_path_alt = path_list_file.parent.joinpath(genome_file_path) if not genome_file_path_alt.exists(): @@ -399,7 +426,10 @@ def parse_input_paths_file(path_list_file): else: genome_file_path = genome_file_path_alt - genome_name_to_genome_path[genome_name] = {"path":genome_file_path, "circular_contigs":putative_circular_contigs} + genome_name_to_genome_path[genome_name] = { + "path": genome_file_path, + "circular_contigs": putative_circular_contigs + } if len(genome_name_to_genome_path) == 0: raise Exception(f"There are no genomes in the provided file: {path_list_file} ") @@ -407,6 +437,7 @@ def parse_input_paths_file(path_list_file): return genome_name_to_genome_path + def write_summaries(organism_2_summary: Dict[Organism, Dict[str, Any]], output_dir: Path): """ Write summary information to YAML files and create a summary projection in TSV format. 
@@ -813,23 +844,31 @@ def check_spots_congruency(graph_spot: nx.Graph, spots: List[Spot]) -> None: graph_spot.nodes[node]["spots"] = {current_spot} -def predict_spots_in_input_organisms(initial_spots: List[Spot], initial_regions: List[Region], - input_org_2_rgps: Dict[Organism, Set[Region]], multigenics: Set[GeneFamily], output: str, - write_graph_flag: bool = False, graph_formats: List[str] = ['gexf'], - overlapping_match: int = 2, set_size: int = 3, exact_match: int = 1) -> Dict: + +def predict_spots_in_input_organisms( + initial_spots: List[Spot], + initial_regions: List[Region], + input_org_2_rgps: Dict[Organism, Set[Region]], + multigenics: Set[GeneFamily], + output: str, + write_graph_flag: bool = False, + graph_formats: List[str] = ['gexf'], + overlapping_match: int = 2, + set_size: int = 3, + exact_match: int = 1 ) -> Dict[Organism, Set[Spot]]: """ Create a spot graph from pangenome RGP and predict spots for input organism RGPs. :param initial_spots: List of original spots in the pangenome. :param initial_regions: List of original regions in the pangenome. - :param input_org_rgps: List of RGPs from the input organism to be associated with spots. + :param input_org_2_rgps: Dictionary mapping input organisms to their RGPs. :param multigenics: Set of pangenome graph multigenic persistent families. :param output: Output directory to save the spot graph. - :param write_graph_flag: If True, writes the spot graph in the specified formats. + :param write_graph_flag: If True, writes the spot graph in the specified formats. Default is False. :param graph_formats: List of graph formats to write (default is ['gexf']). - :param overlapping_match: Number of missing persistent genes allowed when comparing flanking genes. - :param set_size: Number of single copy markers to use as flanking genes for RGP during hotspot computation. - :param exact_match: Number of perfectly matching flanking single copy markers required to associate RGPs. 
+ :param overlapping_match: Number of missing persistent genes allowed when comparing flanking genes. Default is 2. + :param set_size: Number of single copy markers to use as flanking genes for RGP during hotspot computation. Default is 3. + :param exact_match: Number of perfectly matching flanking single copy markers required to associate RGPs. Default is 1. :return: A dictionary mapping input organism RGPs to their predicted spots. """ @@ -870,12 +909,38 @@ def predict_spots_in_input_organisms(initial_spots: List[Spot], initial_regions: return input_org_to_spots -def predict_spot_in_one_organism(graph_spot, input_org_rgps, original_nodes, new_spot_id_counter, multigenics: Set[GeneFamily], - organism_name:str, output: Path, - write_graph_flag: bool = False, graph_formats: List[str] = ['gexf'], - overlapping_match: int = 2, set_size: int = 3, exact_match: int = 1): +def predict_spot_in_one_organism( + graph_spot: nx.Graph, + input_org_rgps: List[Region], + original_nodes: Set[int], + new_spot_id_counter: int, + multigenics: Set[GeneFamily], + organism_name: str, + output: Path, + write_graph_flag: bool = False, + graph_formats: List[str] = ['gexf'], + overlapping_match: int = 2, + set_size: int = 3, + exact_match: int = 1 ) -> Set[Spot]: + """ + Predict spots for input organism RGPs. - + :param graph_spot: The spot graph from the pangenome. + :param input_org_rgps: List of RGPs from the input organism to be associated with spots. + :param original_nodes: Set of original nodes in the spot graph. + :param new_spot_id_counter: Counter for new spot IDs. + :param multigenics: Set of pangenome graph multigenic persistent families. + :param organism_name: Name of the input organism. + :param output: Output directory to save the spot graph. + :param write_graph_flag: If True, writes the spot graph in the specified formats. Default is False. + :param graph_formats: List of graph formats to write (default is ['gexf']). 
+ :param overlapping_match: Number of missing persistent genes allowed when comparing flanking genes. Default is 2. + :param set_size: Number of single copy markers to use as flanking genes for RGP during hotspot computation. Default is 3. + :param exact_match: Number of perfectly matching flanking single copy markers required to associate RGPs. Default is 1. + + Returns: + Set[Spot]: The predicted spots for the input organism RGPs. + """ # Check which input RGP has a spot lost = 0 used = 0 @@ -1035,6 +1100,44 @@ def project_and_write_modules(pangenome: Pangenome, input_organisms: Iterable[Or return input_orgs_to_modules + +def check_projection_arguments(args: argparse.Namespace, parser: argparse.ArgumentParser): + """ + Check the arguments provided for genome projection and raise errors if they are incompatible or missing. + + :param args: An argparse.Namespace object containing parsed command-line arguments. + :param parser: An argparse.ArgumentParser object used to raise errors. + """ + + # Check if we annotate genomes from path files or only a single genome... + if args.fasta or args.anno: + # We are in paths file mode + + incompatible_args = ["single_fasta_file", "single_annot_file", "organism_name", "circular_contigs"] + for single_arg in incompatible_args: + if getattr(args, single_arg) is not None: + parser.error(f"The single genome argument --{single_arg} is incompatible with multiple genomes arguments (--anno and/or --fasta).") + + if args.fasta: + check_input_files(args.fasta, True) + + if args.anno: + check_input_files(args.anno, True) + + elif args.single_fasta_file or args.single_annot_file: + # We are in single file mode + + if args.organism_name is None: + parser.error("Please specify the name of the input organism you want to annotate. 
" + "You can use the --organism_name argument either through the command line or the config file.") + + else: + parser.error("Please provide either a sequence file using the '--single_fasta_file' or '--fasta' option, " + "or an annotation file using the '--single_annot_file' or '--anno' option. " + "You can specify these either through the command line or the config file.") + + + def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: """ Subparser to launch PPanGGOLiN in Command line @@ -1051,34 +1154,43 @@ def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser def parser_projection(parser: argparse.ArgumentParser): """ - Parser for specific argument of annotate command + Parser for specific argument of projection command - :param parser: parser for annotate argument + :param parser: parser for projection argument """ - required = parser.add_argument_group(title="Required arguments", - description="One of the following arguments is required :") + required = parser.add_argument_group(title="Required arguments") + required.add_argument('-p', '--pangenome', required=False, type=Path, help="The pangenome.h5 file") - required.add_argument('--fasta', required=False, type=Path, - help="A tab-separated file listing the organism names, and the fasta filepath of its genomic " - "sequence(s) (the fastas can be compressed with gzip). One line per organism.") - required.add_argument('--anno', required=False, type=Path, - help="A tab-separated file listing the organism names, and the gff/gbff filepath of its " - "annotations (the files can be compressed with gzip). One line per organism. 
" - "If this is provided, those annotations will be used.") + required_multiple = parser.add_argument_group(title="Multiple genome arguments:", + description="Arguments for annotating multiple genomes with the provided pangenome.") + required_multiple.add_argument('--fasta', required=False, type=Path, + help="A tab-separated file listing the organism names, and the fasta filepath of its genomic " + "sequence(s) (the fastas can be compressed with gzip). One line per organism.") + + required_multiple.add_argument('--anno', required=False, type=Path, + help="A tab-separated file listing the organism names, and the gff/gbff filepath of its " + "annotations (the files can be compressed with gzip). One line per organism. " + "If this is provided, those annotations will be used.") + + required_single = parser.add_argument_group(title="Single genome arguments:", + description="Arguments for annotating a single genome with the provided pangenome.") - # required.add_argument("-n", '--organism_name', required=False, type=str, - # help="Name of the input_organism whose genome is being annotated with the provided pangenome.") + required_single.add_argument("-n", '--organism_name', required=False, type=str, + help="Specify the name of the input organism whose genome you want to annotate with the provided pangenome.") - # required.add_argument('--fasta_file', required=False, type=Path, - # help="The filepath of the genomic sequence(s) in FASTA format if the genome to annotate. " - # "(Fasta file can be compressed with gzip)") + required_single.add_argument('--single_fasta_file', required=False, type=Path, + help="Provide the file path to the genomic sequence(s) in FASTA format for the genome you wish to annotate. " + "(Fasta files can be compressed using gzip)") - # required.add_argument('--annot_file', required=False, type=Path, - # help="The filepath of the annotations in GFF/GBFF format for the genome to annotate with the provided pangenome. 
" - # "(Annotation file can be compressed with gzip)") + required_single.add_argument('--single_annot_file', required=False, type=Path, + help="Provide the file path to the annotations in GFF/GBFF format for the genome you want to annotate. " + "(Annotation files can be compressed using gzip)") + + required_single.add_argument('--circular_contigs', nargs="+", required=False, type=tuple, + help="Contigs of the input genome to consider as circular.") optional = parser.add_argument_group(title="Optional arguments") From ee294d37e3bc1b739fcbcc1066259a87003f71f9 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Fri, 15 Sep 2023 18:29:28 +0200 Subject: [PATCH 112/173] adjust args in help --- ppanggolin/projection/projection.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index 1e360b9f..d40ac84b 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -161,7 +161,7 @@ def launch(args: argparse.Namespace): check_input_names(pangenome, genome_name_to_fasta_path) organisms = annotate_fasta_files(genome_name_to_fasta_path=genome_name_to_fasta_path, tmpdir=args.tmpdir, cpu=args.cpu, - translation_table=args.translation_table, norna=annotate_params.norna, kingdom=annotate_params.kingdom, + translation_table=pangenome_params.cluster.translation_table, norna=annotate_params.norna, kingdom=annotate_params.kingdom, allow_overlap=annotate_params.allow_overlap, procedure=annotate_params.prodigal_procedure, disable_bar=args.disable_prog_bar ) @@ -172,7 +172,8 @@ def launch(args: argparse.Namespace): output=output_dir, cpu=args.cpu, use_representatives=args.fast, no_defrag=args.no_defrag, identity=args.identity, coverage=args.coverage, tmpdir=args.tmpdir, - translation_table=args.translation_table, keep_tmp=args.keep_tmp, + translation_table=pangenome_params.cluster.translation_table, + keep_tmp=args.keep_tmp, 
disable_bar=args.disable_prog_bar) @@ -778,6 +779,9 @@ def manage_annotate_param(annotate_param_names: List[str], pangenome_args: argpa annotate_params = argparse.Namespace() # Collecting annotate parameters from different sources + # if they are found in pangenome param they are used + # elif they are found in config they are used + # else use the default value. for annotate_arg in annotate_param_names: if hasattr(pangenome_args, annotate_arg): param_val = getattr(pangenome_args, annotate_arg) @@ -1163,7 +1167,7 @@ def parser_projection(parser: argparse.ArgumentParser): required.add_argument('-p', '--pangenome', required=False, type=Path, help="The pangenome.h5 file") - required_multiple = parser.add_argument_group(title="Multiple genome arguments:", + required_multiple = parser.add_argument_group(title="Multiple genome arguments", description="Arguments for annotating multiple genomes with the provided pangenome.") required_multiple.add_argument('--fasta', required=False, type=Path, @@ -1173,9 +1177,9 @@ def parser_projection(parser: argparse.ArgumentParser): required_multiple.add_argument('--anno', required=False, type=Path, help="A tab-separated file listing the organism names, and the gff/gbff filepath of its " "annotations (the files can be compressed with gzip). One line per organism. 
" - "If this is provided, those annotations will be used.") + "If provided, those annotations will be used.") - required_single = parser.add_argument_group(title="Single genome arguments:", + required_single = parser.add_argument_group(title="Single genome arguments", description="Arguments for annotating a single genome with the provided pangenome.") required_single.add_argument("-n", '--organism_name', required=False, type=str, @@ -1200,7 +1204,7 @@ def parser_projection(parser: argparse.ArgumentParser): help="Output directory") optional.add_argument('--no_defrag', required=False, action="store_true", - help="DO NOT Realign gene families to link fragments with" + help="DO NOT Realign gene families to link fragments with " "their non-fragmented gene family. (default: False)") optional.add_argument("--fast", required=False, action="store_true", @@ -1213,9 +1217,6 @@ def parser_projection(parser: argparse.ArgumentParser): optional.add_argument('--coverage', required=False, type=restricted_float, default=0.8, help="min coverage percentage threshold") - optional.add_argument("--translation_table", required=False, default=11, type=int, - help="Translation table (genetic code) to use.") - optional.add_argument("--use_pseudo", required=False, action="store_true", help="In the context of provided annotation, use this option to read pseudogenes. 
" "(Default behavior is to ignore them)") From 2c563dc7417f04b0454566900c40e9f0a5d5a540 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Fri, 15 Sep 2023 18:42:13 +0200 Subject: [PATCH 113/173] fix changing argument --- ppanggolin/RGP/genomicIsland.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ppanggolin/RGP/genomicIsland.py b/ppanggolin/RGP/genomicIsland.py index ad6d726b..96cf54f6 100644 --- a/ppanggolin/RGP/genomicIsland.py +++ b/ppanggolin/RGP/genomicIsland.py @@ -282,7 +282,7 @@ def predict_rgp(pangenome: Pangenome, persistent_penalty: int = 3, variable_gain logging.getLogger("PPanGGOLiN").info("Detecting multigenic families...") multigenics = pangenome.get_multigenics(dup_margin) logging.getLogger("PPanGGOLiN").info("Compute Regions of Genomic Plasticity ...") - name_scheme = naming_scheme(pangenome) + name_scheme = naming_scheme(pangenome.organisms) for org in tqdm(pangenome.organisms, total=pangenome.number_of_organisms, unit="genomes", disable=disable_bar): for region in compute_org_rgp(org, multigenics, persistent_penalty, variable_gain, min_length, min_score, naming=name_scheme): From c86ca78fc278a6b283116ff9cf23d150d502a499 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Fri, 15 Sep 2023 18:48:40 +0200 Subject: [PATCH 114/173] fix forgotten return --- ppanggolin/annotate/annotate.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ppanggolin/annotate/annotate.py b/ppanggolin/annotate/annotate.py index 455eae46..b5d62b37 100644 --- a/ppanggolin/annotate/annotate.py +++ b/ppanggolin/annotate/annotate.py @@ -438,6 +438,7 @@ def chose_gene_identifiers(pangenome:Pangenome)-> bool: gene.ID = gene.local_identifier # Erase ppanggolin generated gene ids and replace with local identifiers gene.local_identifier = "" # this is now useless, setting it to default value pangenome._mk_gene_getter() # re-build the gene getter + return True else: return False From 56b795a07ab7b73747779299a91b8b346a6f90e8 Mon Sep 17 00:00:00 2001 From: JeanMainguy 
Date: Fri, 15 Sep 2023 19:12:23 +0200 Subject: [PATCH 115/173] adjust logging level and fix minot issues --- ppanggolin/projection/projection.py | 74 +++++++---------------------- 1 file changed, 16 insertions(+), 58 deletions(-) diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index d40ac84b..cfb23de2 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -161,7 +161,7 @@ def launch(args: argparse.Namespace): check_input_names(pangenome, genome_name_to_fasta_path) organisms = annotate_fasta_files(genome_name_to_fasta_path=genome_name_to_fasta_path, tmpdir=args.tmpdir, cpu=args.cpu, - translation_table=pangenome_params.cluster.translation_table, norna=annotate_params.norna, kingdom=annotate_params.kingdom, + translation_table=int(pangenome_params.cluster.translation_table), norna=annotate_params.norna, kingdom=annotate_params.kingdom, allow_overlap=annotate_params.allow_overlap, procedure=annotate_params.prodigal_procedure, disable_bar=args.disable_prog_bar ) @@ -169,13 +169,13 @@ def launch(args: argparse.Namespace): # pangenome.add_organism(input_organism) input_org_to_lonely_genes_count = annotate_input_genes_with_pangenome_families(pangenome, input_organisms=organisms, - output=output_dir, cpu=args.cpu, use_representatives=args.fast, - no_defrag=args.no_defrag, identity=args.identity, - coverage=args.coverage, tmpdir=args.tmpdir, - translation_table=pangenome_params.cluster.translation_table, - keep_tmp=args.keep_tmp, - disable_bar=args.disable_prog_bar) - + output=output_dir, cpu=args.cpu, use_representatives=args.fast, + no_defrag=args.no_defrag, identity=args.identity, + coverage=args.coverage, tmpdir=args.tmpdir, + translation_table=int(pangenome_params.cluster.translation_table), + keep_tmp=args.keep_tmp, + disable_bar=args.disable_prog_bar) + input_org_2_rgps, input_org_to_spots, input_orgs_to_modules = {}, {}, {} @@ -302,48 +302,6 @@ def 
read_annotation_files(genome_name_to_annot_path: Dict[str,dict], cpu: int = return organisms, org_to_has_fasta_flag -# def annotate_input_genomes(): - - - -# if args.annot_file is not None: -# # read_annotations(pangenome, args.anno, cpu=args.cpu, pseudo=args.use_pseudo, disable_bar=args.disable_prog_bar) -# input_organism, has_sequence = read_anno_file(organism_name=args.organism_name, -# filename=args.annot_file, -# circular_contigs=[], -# pseudo=args.use_pseudo) -# if input_organism.number_of_genes() == 0: -# raise ValueError("The input organism lacks gene annotations. " -# f"Please verify the provided annotation file: {args.annot_file}") - -# if not has_sequence: -# if args.fasta_file: -# retrieve_gene_sequences_from_fasta_file( -# input_organism, args.fasta_file) -# else: -# raise Exception("The gff/gbff provided did not have any sequence information, " -# "Thus, we do not have the information we need to continue the projection.") - -# elif args.fasta_file is not None: -# annotate_param_names = ["norna", "kingdom", -# "allow_overlap", "prodigal_procedure"] - -# annotate_params = manage_annotate_param( -# annotate_param_names, pangenome_params.annotate, args.config) - -# input_organism = annotate_organism(org_name=args.organism_name, file_name=args.fasta_file, circular_contigs=[], tmpdir=args.tmpdir, -# code=args.translation_table, norna=annotate_params.norna, kingdom=annotate_params.kingdom, -# allow_overlap=annotate_params.allow_overlap, procedure=annotate_params.prodigal_procedure) -# if input_organism.number_of_genes() == 0: -# raise ValueError("No genes have been predicted in the input organism's FASTA file, making projection impossible.") - -# else: -# raise Exception( -# "At least one of --fasta_file or --anno_file must be given") - - - - def get_gene_sequences_from_fasta_files(organisms, genome_name_to_annot_path): """ Get gene sequences from fasta path file @@ -892,7 +850,7 @@ def predict_spots_in_input_organisms( for input_organism, rgps in 
input_org_2_rgps.items(): if len(rgps) == 0: - logging.getLogger('PPanGGOLiN').info(f"{input_organism.name}: No RGPs have been found. " + logging.getLogger('PPanGGOLiN').debug(f"{input_organism.name}: No RGPs have been found. " "As a result, spot prediction and RGP output will be skipped.") input_org_to_spots[input_organism] = set() @@ -961,7 +919,7 @@ def predict_spot_in_one_organism( input_org_node_to_rgps[border_node].add(rgp) if len(input_org_node_to_rgps) == 0: - logging.getLogger("PPanGGOLiN").info(f"{organism_name}: no RGPs of the input organism will be associated with any spot of insertion " + logging.getLogger("PPanGGOLiN").debug(f"{organism_name}: no RGPs of the input organism will be associated with any spot of insertion " "as they are on a contig border (or have " f"less than {set_size} persistent gene families until the contig border). " "Projection of spots stops here") @@ -970,10 +928,10 @@ def predict_spot_in_one_organism( # remove node that were already in the graph new_nodes = set(input_org_node_to_rgps) - original_nodes - logging.getLogger("PPanGGOLiN").info(f"{organism_name}: {lost} RGPs were not used as they are on a contig border (or have" + logging.getLogger("PPanGGOLiN").debug(f"{organism_name}: {lost} RGPs were not used as they are on a contig border (or have" f"less than {set_size} persistent gene families until the contig border)") - logging.getLogger("PPanGGOLiN").info( + logging.getLogger("PPanGGOLiN").debug( f"{organism_name}: {used} RGPs of the input organism will be associated to a spot of insertion") # add potential edges from new nodes to the rest of the nodes @@ -1014,7 +972,7 @@ def predict_spot_in_one_organism( elif len(spots_of_the_cc) > 1: # more than one spot in the cc - logging.getLogger("PPanGGOLiN").info(f'{organism_name}: Some RGPs of the input organism ' + logging.getLogger("PPanGGOLiN").debug(f'{organism_name}: Some RGPs of the input organism ' f"are connected to {len(spots_of_the_cc)} original spots of the pangenome.") 
input_rgps_of_the_cc = set() @@ -1052,7 +1010,7 @@ def predict_spot_in_one_organism( new_spots = {spot for spot in input_org_spots if type(spot) == NewSpot} - logging.getLogger('PPanGGOLiN').info( + logging.getLogger('PPanGGOLiN').debug( f'{organism_name}: {len(new_spots)} new spots have been created for the input genome.') if new_spots: @@ -1094,10 +1052,10 @@ def project_and_write_modules(pangenome: Pangenome, input_organisms: Iterable[Or fout.write( f"module_{mod.ID}\t{input_organism.name}\t{completion}\n") - logging.getLogger('PPanGGOLiN').info( + logging.getLogger('PPanGGOLiN').debug( f"{input_organism.name}: {counter} modules have been projected to the input genomes.") - logging.getLogger('PPanGGOLiN').info( + logging.getLogger('PPanGGOLiN').debug( f"{input_organism.name}: Projected modules have been written in: '{output_file}'") input_orgs_to_modules[input_organism] = modules_in_input_org From d025e62ea05b8a40f4c867d3a5c49e8a7297393b Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Fri, 15 Sep 2023 19:13:55 +0200 Subject: [PATCH 116/173] rm commented line --- ppanggolin/projection/projection.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index cfb23de2..1e3d9fcb 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -164,9 +164,6 @@ def launch(args: argparse.Namespace): translation_table=int(pangenome_params.cluster.translation_table), norna=annotate_params.norna, kingdom=annotate_params.kingdom, allow_overlap=annotate_params.allow_overlap, procedure=annotate_params.prodigal_procedure, disable_bar=args.disable_prog_bar ) - - # Add input organism in pangenome. This is temporary as the pangenome object is not going to be written. 
- # pangenome.add_organism(input_organism) input_org_to_lonely_genes_count = annotate_input_genes_with_pangenome_families(pangenome, input_organisms=organisms, output=output_dir, cpu=args.cpu, use_representatives=args.fast, From b870836490124c37d3756782bad11da5dbec182a Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Fri, 15 Sep 2023 19:42:42 +0200 Subject: [PATCH 117/173] update test cmds --- .github/workflows/main.yml | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 2b821884..df85e287 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -131,10 +131,17 @@ jobs: shell: bash -l {0} run: | cd testingDataset + head organisms.gbff.list | sed 's/^/input_org_/g' > organisms.gbff.head.list ppanggolin projection --pangenome stepbystep/pangenome.h5 -o projection_from_gbff \ - --organism_name trotro --annot GBFF/GCF_000026905.1_ASM2690v1_genomic.gbff.gz --spot_graph + --anno organisms.gbff.head.list --fast + + ppanggolin projection --pangenome mybasicpangenome/pangenome.h5 -o projection_from_fasta \ - --organism_name trotro --fasta_file FASTA/GCF_002776845.1_ASM277684v1_genomic.fna.gz \ - --spot_graph --graph_formats graphml --fast --keep_tmp + --organism_name chlam_A --single_fasta_file FASTA/GCF_002776845.1_ASM277684v1_genomic.fna.gz \ + --spot_graph --graph_formats graphml --fast --keep_tmp -f + + + + + - \ No newline at end of file From d4744c57cc94431746e09c6d20c47841abd5481b Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Mon, 18 Sep 2023 14:08:22 +0200 Subject: [PATCH 118/173] make input arg flexible accepting tsv or single fasta/gff/gbff --- .github/workflows/main.yml | 10 +- ppanggolin/main.py | 6 +- ppanggolin/projection/projection.py | 157 +++++++++++++++++----------- ppanggolin/utils.py | 4 +- 4 files changed, 108 insertions(+), 69 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index df85e287..3f07c28b 100644 
--- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -132,15 +132,15 @@ jobs: run: | cd testingDataset head organisms.gbff.list | sed 's/^/input_org_/g' > organisms.gbff.head.list - ppanggolin projection --pangenome stepbystep/pangenome.h5 -o projection_from_gbff \ - --anno organisms.gbff.head.list --fast + ppanggolin projection --pangenome stepbystep/pangenome.h5 -o projection_from_lisy_of_gbff \ + --anno organisms.gbff.head.list - ppanggolin projection --pangenome mybasicpangenome/pangenome.h5 -o projection_from_fasta \ - --organism_name chlam_A --single_fasta_file FASTA/GCF_002776845.1_ASM277684v1_genomic.fna.gz \ + ppanggolin projection --pangenome mybasicpangenome/pangenome.h5 -o projection_from_single_fasta \ + --organism_name chlam_A --fasta FASTA/GCF_002776845.1_ASM277684v1_genomic.fna.gz \ --spot_graph --graph_formats graphml --fast --keep_tmp -f - +Z diff --git a/ppanggolin/main.py b/ppanggolin/main.py index 017cdadd..4747bc7d 100644 --- a/ppanggolin/main.py +++ b/ppanggolin/main.py @@ -144,8 +144,10 @@ def cmd_line() -> argparse.Namespace: "using the --sequences argument, either through the command line or the config file.") if args.subcommand == "projection": - ppanggolin.projection.projection.check_projection_arguments(args, parser) - + # check argument correctness and determine input mode (single or multiple files) and add it to args. 
+ input_mode = ppanggolin.projection.projection.check_projection_arguments(args, parser) + setattr(args, "input_mode", input_mode) + return args diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index 1e3d9fcb..7fb39a32 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -28,7 +28,7 @@ from ppanggolin.annotate import subparser as annotate_subparser from ppanggolin.pangenome import Pangenome # from ppanggolin.genome import input_organism, Gene, RNA, Contig -from ppanggolin.utils import create_tmpdir, read_compressed_or_not, write_compressed_or_not, restricted_float, mk_outdir, get_config_args, parse_config_file, get_default_args, check_input_files +from ppanggolin.utils import detect_filetype, create_tmpdir, read_compressed_or_not, write_compressed_or_not, restricted_float, mk_outdir, get_config_args, parse_config_file, get_default_args, check_input_files from ppanggolin.align.alignOnPang import get_input_seq_to_family_with_rep,get_input_seq_to_family_with_all, project_and_write_partition from ppanggolin.formats.writeSequences import write_gene_sequences_from_annotations from ppanggolin.formats.readBinaries import check_pangenome_info @@ -110,29 +110,26 @@ def launch(args: argparse.Namespace): **{step: argparse.Namespace(**k_v) for step, k_v in pangenome.parameters.items()}) - if args.anno: - genome_name_to_annot_path = parse_input_paths_file(args.anno) + genome_name_to_fasta_path, genome_name_to_annot_path = None, None - elif args.single_annot_file: - circular_contigs = args.circular_contigs if args.circular_contigs else [] - genome_name_to_annot_path = {args.organism_name: {"path": args.single_annot_file, - "circular_contigs": circular_contigs} - } - else: - genome_name_to_annot_path = None - - if args.fasta: - genome_name_to_fasta_path = parse_input_paths_file(args.fasta) + if args.input_mode == "multiple": + if args.anno: + genome_name_to_annot_path = parse_input_paths_file(args.anno) 
- elif args.single_fasta_file: - circular_contigs = args.circular_contigs if args.circular_contigs else [] - genome_name_to_fasta_path = {args.organism_name: {"path": args.single_fasta_file, - "circular_contigs": circular_contigs} - } - else: - genome_name_to_fasta_path = None + if args.fasta: + genome_name_to_fasta_path = parse_input_paths_file(args.fasta) + else: # args.input_mode == "single: + circular_contigs = args.circular_contigs if args.circular_contigs else [] + if args.anno: + genome_name_to_annot_path = {args.organism_name: {"path": args.annot, + "circular_contigs": circular_contigs}} + + if args.fasta: + genome_name_to_fasta_path = {args.organism_name: {"path": args.fasta, + "circular_contigs": circular_contigs}} + if genome_name_to_annot_path: check_input_names(pangenome, genome_name_to_annot_path) @@ -149,9 +146,6 @@ def launch(args: argparse.Namespace): "FASTA sequences using the --fasta or --single_fasta_file options. Therefore, it is impossible to project the pangenome onto the input genomes. " f"The following organisms have no associated sequence data: {', '.join(o.name for o in organisms_with_no_fasta)}") - - - elif genome_name_to_fasta_path: annotate_param_names = ["norna", "kingdom", "allow_overlap", "prodigal_procedure"] @@ -1060,50 +1054,101 @@ def project_and_write_modules(pangenome: Pangenome, input_organisms: Iterable[Or return input_orgs_to_modules -def check_projection_arguments(args: argparse.Namespace, parser: argparse.ArgumentParser): +def determine_input_mode(input_file: Path, expected_types: list[str], parser: argparse.ArgumentParser) -> str: + """ + Determine the input mode based on the provided input file and expected file types. + + :param input_file: A Path object representing the input file. + :param expected_types: A list of expected file types (e.g., ['fasta', 'gff', 'gbff', 'tsv']). + + :return: A string indicating the input mode ('single' or 'multiple'). 
+ """ + if not input_file.exists(): + parser.error(f"The provided file {input_file} does not exist.") + + try: + filetype = detect_filetype(input_file) + except Exception: + parser.error("Based on its content, the provided file is not recognized as a valid input file. Please ensure it is in one of the supported formats (FASTA, GFF/GBFF, or TSV).") + + if filetype == "tsv": + logging.getLogger('PPanGGOLiN').debug(f"The provided file ({input_file}) is detected as a TSV file.") + mode = "multiple" + elif filetype in expected_types: + logging.getLogger('PPanGGOLiN').debug(f"The provided file ({input_file}) is detected as a single {'/'.join(expected_types)} file.") + mode = "single" + else: + logging.getLogger('PPanGGOLiN').error(f"The provided file {input_file} is not recognized as a valid {'/'.join(expected_types)} file or a TSV file listing names and {'/'.join(expected_types)} files of genomes to annotate.") + parser.error(f"The provided file {input_file} is not recognized as a valid {'/'.join(expected_types)} file or a TSV file listing names and files of genomes to annotate.") + + return mode + + +def check_projection_arguments(args: argparse.Namespace, parser: argparse.ArgumentParser ) -> str: """ Check the arguments provided for genome projection and raise errors if they are incompatible or missing. :param args: An argparse.Namespace object containing parsed command-line arguments. - :param parser: An argparse.ArgumentParser object used to raise errors. + :param parser : parser of the command + :return: A string indicating the input mode ('single' or 'multiple'). """ # Check if we annotate genomes from path files or only a single genome... - if args.fasta or args.anno: + if not args.anno and not args.fasta: + parser.error("Please provide either a FASTA file or a tab-separated file listing sequence files using the '--fasta' option, " + "or an annotation file or a tab-separated file listing annotation files using the '--anno' option. 
" + "You can specify these either through the command line or the configuration file.") + + mode_from_fasta, mode_from_anno = None, None + if args.fasta: + mode_from_fasta = determine_input_mode(args.fasta, ['fasta'], parser) + input_mode = mode_from_fasta + + if args.anno: + mode_from_anno = determine_input_mode(args.anno, ['gff', "gbff"], parser) + input_mode = mode_from_anno + + logging.getLogger('PPanGGOLiN').debug("") + + if mode_from_fasta and mode_from_anno and mode_from_fasta != mode_from_anno: + single_input, multiple_input = ("fasta", "anno") if mode_from_fasta == "single" else ("anno", "fasta") + + parser.error(f"You've provided both a single annotation/fasta file using the '--{single_input}' option and a list of files using " + f"the '--{multiple_input}' option. Please choose either a single file or a tab-separated file listing genome files, but not both.") + + + if input_mode == "multiple": # We are in paths file mode - incompatible_args = ["single_fasta_file", "single_annot_file", "organism_name", "circular_contigs"] + incompatible_args = ["organism_name", "circular_contigs"] for single_arg in incompatible_args: if getattr(args, single_arg) is not None: - parser.error(f"The single genome argument --{single_arg} is incompatible with multiple genomes arguments (--anno and/or --fasta).") - + parser.error("You provided a TSV file listing the files of genomes you wish to annotate. " + f"Therefore, the single genome argument '--{single_arg}' is incompatible with this multiple genomes file.") + if args.fasta: check_input_files(args.fasta, True) if args.anno: check_input_files(args.anno, True) - elif args.single_fasta_file or args.single_annot_file: + elif input_mode == "single": # We are in single file mode if args.organism_name is None: - parser.error("Please specify the name of the input organism you want to annotate. " + parser.error("You directly provided a single FASTA/GBFF/GFF file. Please specify the name of the input organism you want to annotate. 
" "You can use the --organism_name argument either through the command line or the config file.") - - else: - parser.error("Please provide either a sequence file using the '--single_fasta_file' or '--fasta' option, " - "or an annotation file using the '--single_annot_file' or '--anno' option. " - "You can specify these either through the command line or the config file.") - + + return input_mode def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: """ Subparser to launch PPanGGOLiN in Command line - :param sub_parser : sub_parser for align command + :param sub_parser : sub_parser for projection command - :return : parser arguments for align command + :return : parser arguments for projection command """ parser = sub_parser.add_parser( "projection", formatter_class=argparse.ArgumentDefaultsHelpFormatter) @@ -1121,35 +1166,25 @@ def parser_projection(parser: argparse.ArgumentParser): required.add_argument('-p', '--pangenome', required=False, type=Path, help="The pangenome.h5 file") - - required_multiple = parser.add_argument_group(title="Multiple genome arguments", - description="Arguments for annotating multiple genomes with the provided pangenome.") - required_multiple.add_argument('--fasta', required=False, type=Path, - help="A tab-separated file listing the organism names, and the fasta filepath of its genomic " - "sequence(s) (the fastas can be compressed with gzip). One line per organism.") + required.add_argument('--fasta', required=False, type=Path, + help="Specify a FASTA file containing the genomic sequences of the organism(s) you wish to annotate, " + "or provide a tab-separated file listing organism names alongside their respective FASTA filepaths, with one line per organism.") - required_multiple.add_argument('--anno', required=False, type=Path, - help="A tab-separated file listing the organism names, and the gff/gbff filepath of its " - "annotations (the files can be compressed with gzip). One line per organism. 
" - "If provided, those annotations will be used.") + required.add_argument('--anno', required=False, type=Path, + help="Specify an annotation file in GFF/GBFF format for the genome you wish to annotate. " + "Alternatively, you can provide a tab-separated file listing organism names alongside their respective annotation filepaths, " + "with one line per organism. If both an annotation file and a FASTA file are provided, the annotation file will take precedence.") - required_single = parser.add_argument_group(title="Single genome arguments", - description="Arguments for annotating a single genome with the provided pangenome.") + required_single = parser.add_argument_group(title="Single Genome Arguments", + description="Use these options when providing a single FASTA or annotation file:") required_single.add_argument("-n", '--organism_name', required=False, type=str, - help="Specify the name of the input organism whose genome you want to annotate with the provided pangenome.") - - required_single.add_argument('--single_fasta_file', required=False, type=Path, - help="Provide the file path to the genomic sequence(s) in FASTA format for the genome you wish to annotate. " - "(Fasta files can be compressed using gzip)") + help="Specify the name of the organism whose genome you want to annotate when providing a single FASTA or annotation file.") - required_single.add_argument('--single_annot_file', required=False, type=Path, - help="Provide the file path to the annotations in GFF/GBFF format for the genome you want to annotate. 
" - "(Annotation files can be compressed using gzip)") - required_single.add_argument('--circular_contigs', nargs="+", required=False, type=tuple, - help="Contigs of the input genome to consider as circular.") + help="Specify the contigs of the input genome that should be treated as circular when providing a single FASTA or annotation file.") + optional = parser.add_argument_group(title="Optional arguments") diff --git a/ppanggolin/utils.py b/ppanggolin/utils.py index fa9b3f40..1ac197b6 100755 --- a/ppanggolin/utils.py +++ b/ppanggolin/utils.py @@ -92,7 +92,7 @@ def check_tsv_sanity(tsv: Path): except IOError as ios_error: raise IOError(ios_error) except Exception as exception_error: - raise Exception(f"The following unexpected error happened when opening the list of pangenomes : " + raise Exception(f"The following unexpected error happened when opening the list of genomes path: " f"{exception_error}") else: name_set = set() @@ -319,6 +319,8 @@ def detect_filetype(filename: Path) -> str: return 'gff' elif first_line.startswith(">"): return 'fasta' + elif "\t" in first_line: + return "tsv" else: raise Exception("Filetype was not gff3 (file starts with '##gff-version 3') " "nor gbff/gbk (file starts with 'LOCUS '). 
" From 577a4280cef2451047776a65515ab03c925bcbb4 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Mon, 18 Sep 2023 14:20:49 +0200 Subject: [PATCH 119/173] remove typo in main.yml --- .github/workflows/main.yml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 3f07c28b..9c4510d4 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -140,8 +140,3 @@ jobs: --organism_name chlam_A --fasta FASTA/GCF_002776845.1_ASM277684v1_genomic.fna.gz \ --spot_graph --graph_formats graphml --fast --keep_tmp -f -Z - - - - From a30515a6fbb9ff89aa6a8e493b8f10fb5fedaf43 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Mon, 18 Sep 2023 15:31:58 +0200 Subject: [PATCH 120/173] fix wrong typing --- ppanggolin/projection/projection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index 7fb39a32..3bd66b85 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -1054,7 +1054,7 @@ def project_and_write_modules(pangenome: Pangenome, input_organisms: Iterable[Or return input_orgs_to_modules -def determine_input_mode(input_file: Path, expected_types: list[str], parser: argparse.ArgumentParser) -> str: +def determine_input_mode(input_file: Path, expected_types: List[str], parser: argparse.ArgumentParser) -> str: """ Determine the input mode based on the provided input file and expected file types. 
From b6314573a25709c35d225ae15ad7ae88f741225c Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Mon, 18 Sep 2023 15:59:56 +0200 Subject: [PATCH 121/173] rm useless output --- ppanggolin/align/alignOnPang.py | 12 +++--------- ppanggolin/projection/projection.py | 6 +++--- 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/ppanggolin/align/alignOnPang.py b/ppanggolin/align/alignOnPang.py index 9e527450..59e27e4f 100644 --- a/ppanggolin/align/alignOnPang.py +++ b/ppanggolin/align/alignOnPang.py @@ -144,11 +144,9 @@ def map_input_gene_to_family_all_aln(aln_res: Path, outdir:Path, pangenome: Pang seq2pang = {} aln_file_clean = outdir / f"alignment_input_seqs_to_all_pangenome_genes.tsv" # write the actual result file - input_seq_to_gene_family = outdir / f"input_seqs_to_gene_family.tsv" logging.getLogger().debug(f'Writing alignment file in {aln_file_clean}') - logging.getLogger().debug(f'Writing gene family IDs to the input sequence ID file: {input_seq_to_gene_family}') - with open(aln_res, "r") as alnFile, open(aln_file_clean, "w") as aln_outfl, open(input_seq_to_gene_family, "w") as outgene2fam: + with open(aln_res, "r") as alnFile, open(aln_file_clean, "w") as aln_outfl: for line in alnFile: line_splitted = line.split() @@ -162,7 +160,6 @@ def map_input_gene_to_family_all_aln(aln_res: Path, outdir:Path, pangenome: Pang if seq2pang.get(input_seq_id) is None: # if no results were found yet family = pangenome.get_gene(gene_id).family seq2pang[input_seq_id] = family # then the best hit is the first one we see. 
- outgene2fam.write(f"{input_seq_id}\t{family.name}\n") return seq2pang, aln_file_clean @@ -181,11 +178,9 @@ def map_input_gene_to_family_rep_aln(aln_res: Path, outdir:Path, pangenome: Pang seq2pang = {} aln_file_clean = outdir / f"alignment_input_seqs_to_pangenome_gene_families.tsv" # write the actual result file - input_seq_to_gene_family = outdir / f"input_seqs_to_gene_family.tsv" logging.getLogger().debug(f'Writing alignment file in {aln_file_clean}') - logging.getLogger().debug(f'Writing Gene family id to input seq id file in {input_seq_to_gene_family}') - with open(aln_res, "r") as alnFile, open(aln_file_clean, "w") as aln_outfl, open(input_seq_to_gene_family, "w") as outgene2fam: + with open(aln_res, "r") as alnFile, open(aln_file_clean, "w") as aln_outfl: for line in alnFile: line_splitted = line.split() @@ -198,8 +193,7 @@ def map_input_gene_to_family_rep_aln(aln_res: Path, outdir:Path, pangenome: Pang if seq2pang.get(input_seq_id) is None: # if no results were found yet family = pangenome.get_gene_family(gene_family_id) # then the best hit is the first one we see. 
- seq2pang[input_seq_id] = family - outgene2fam.write(f"{input_seq_id}\t{family.name}\n") + seq2pang[input_seq_id] = family return seq2pang, aln_file_clean diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index 3bd66b85..208911db 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -513,11 +513,11 @@ def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_org with create_tmpdir(main_dir=tmpdir, basename="align_input_seq_tmpdir", keep_tmp=keep_tmp) as new_tmpdir: if use_representatives: - _, seqid_to_gene_family = get_input_seq_to_family_with_rep(pangenome, seq_fasta_files, output, new_tmpdir, is_input_seq_nt=True, + _, seqid_to_gene_family = get_input_seq_to_family_with_rep(pangenome, seq_fasta_files, output=new_tmpdir, tmpdir=new_tmpdir, is_input_seq_nt=True, cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, translation_table=translation_table) else: _, seqid_to_gene_family = get_input_seq_to_family_with_all(pangenome=pangenome, sequence_files=seq_fasta_files, - output=output, tmpdir=new_tmpdir, is_input_seq_nt=True, + output=new_tmpdir, tmpdir=new_tmpdir, is_input_seq_nt=True, cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, translation_table=translation_table, disable_bar=disable_bar) input_org_to_lonely_genes_count = {} @@ -1189,7 +1189,7 @@ def parser_projection(parser: argparse.ArgumentParser): optional = parser.add_argument_group(title="Optional arguments") optional.add_argument('-o', '--output', required=False, type=Path, - default="ppanggolin_output" + time.strftime("_DATE%Y-%m-%d_HOUR%H.%M.%S", + default="ppanggolin_projection" + time.strftime("_DATE%Y-%m-%d_HOUR%H.%M.%S", time.localtime()) + "_PID" + str(os.getpid()), help="Output directory") From 1ac16a55ed81c27ea6f92ec73eeb9fce59f3d1ae Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Mon, 18 Sep 2023 16:07:37 +0200 Subject: [PATCH 122/173] add gene to gf per input --- 
ppanggolin/align/alignOnPang.py | 21 +++++++++++++++++++++ ppanggolin/projection/projection.py | 12 ++++++------ 2 files changed, 27 insertions(+), 6 deletions(-) diff --git a/ppanggolin/align/alignOnPang.py b/ppanggolin/align/alignOnPang.py index 59e27e4f..e06d67a0 100644 --- a/ppanggolin/align/alignOnPang.py +++ b/ppanggolin/align/alignOnPang.py @@ -276,6 +276,27 @@ def project_and_write_partition(seqid_to_gene_family: Dict[str, GeneFamily], seq partProjFile.write(remainingSeq + "\tcloud\n") # if there is no hit, it's going to be cloud genes. return partition_proj +def write_gene_to_gene_family(seqid_to_gene_family: Dict[str, GeneFamily], seq_set: Set[str], output: Path) -> Path: + """ + Write input gene to gene family. + + :param seqid_to_gene_family: dictionnary which link sequence and pangenome gene family + :param seq_set: input sequences + :param output: Path of the output directory + + :return: Path to file which contain partition projection + """ + + gene_fam_map_file = output.absolute() / "gene_to_gene_family.tsv" + with open(gene_fam_map_file, "w") as partProjFile: + for input_seq, pangFam in seqid_to_gene_family.items(): + partProjFile.write(f"{input_seq}\t{pangFam.name}\n") + + for remainingSeq in seq_set - seqid_to_gene_family.keys(): + partProjFile.write(f"{remainingSeq}\t{remainingSeq}\n") # if there is no hit, gene family is itself. 
+ + return gene_fam_map_file + def get_fam_to_rgp(pangenome, multigenics: set) -> dict: """ diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index 208911db..ec85ea05 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -23,19 +23,17 @@ # # local libraries -from ppanggolin.annotate.synta import annotate_organism, read_fasta, get_dna_sequence -from ppanggolin.annotate.annotate import read_anno_file, launch_read_anno, launch_annotate_organism, local_identifiers_are_unique +from ppanggolin.annotate.synta import read_fasta, get_dna_sequence +from ppanggolin.annotate.annotate import launch_read_anno, launch_annotate_organism, local_identifiers_are_unique from ppanggolin.annotate import subparser as annotate_subparser from ppanggolin.pangenome import Pangenome -# from ppanggolin.genome import input_organism, Gene, RNA, Contig from ppanggolin.utils import detect_filetype, create_tmpdir, read_compressed_or_not, write_compressed_or_not, restricted_float, mk_outdir, get_config_args, parse_config_file, get_default_args, check_input_files -from ppanggolin.align.alignOnPang import get_input_seq_to_family_with_rep,get_input_seq_to_family_with_all, project_and_write_partition +from ppanggolin.align.alignOnPang import write_gene_to_gene_family, get_input_seq_to_family_with_rep,get_input_seq_to_family_with_all, project_and_write_partition from ppanggolin.formats.writeSequences import write_gene_sequences_from_annotations from ppanggolin.formats.readBinaries import check_pangenome_info -# from ppanggolin.formats import write_pangenome from ppanggolin.RGP.genomicIsland import naming_scheme, compute_org_rgp from ppanggolin.RGP.spot import make_spot_graph, check_sim, add_new_node_in_spot_graph, write_spot_graph -from ppanggolin.genome import Organism, Gene, RNA, Contig +from ppanggolin.genome import Organism from ppanggolin.geneFamily import GeneFamily from ppanggolin.region import Region, Spot, Module from 
ppanggolin.formats.writeFlat import summarize_spots @@ -528,6 +526,8 @@ def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_org seq_set = {gene.ID if gene.local_identifier == "" else gene.local_identifier for gene in input_organism.genes} project_and_write_partition(seqid_to_gene_family, seq_set, org_outdir) + + write_gene_to_gene_family(seqid_to_gene_family, seq_set, org_outdir) lonely_genes = set() for gene in input_organism.genes: From 41ad67caf7c647cbe0367f352a8e8cc7597c85e3 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Mon, 18 Sep 2023 18:30:19 +0200 Subject: [PATCH 123/173] use better fct name --- ppanggolin/projection/projection.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index ec85ea05..e5c8df73 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -1054,7 +1054,7 @@ def project_and_write_modules(pangenome: Pangenome, input_organisms: Iterable[Or return input_orgs_to_modules -def determine_input_mode(input_file: Path, expected_types: List[str], parser: argparse.ArgumentParser) -> str: +def infer_input_mode(input_file: Path, expected_types: List[str], parser: argparse.ArgumentParser) -> str: """ Determine the input mode based on the provided input file and expected file types. 
@@ -1101,11 +1101,11 @@ def check_projection_arguments(args: argparse.Namespace, parser: argparse.Argume mode_from_fasta, mode_from_anno = None, None if args.fasta: - mode_from_fasta = determine_input_mode(args.fasta, ['fasta'], parser) + mode_from_fasta = infer_input_mode(args.fasta, ['fasta'], parser) input_mode = mode_from_fasta if args.anno: - mode_from_anno = determine_input_mode(args.anno, ['gff', "gbff"], parser) + mode_from_anno = infer_input_mode(args.anno, ['gff', "gbff"], parser) input_mode = mode_from_anno logging.getLogger('PPanGGOLiN').debug("") From 5c50430d5f48c0f3202131a4306e3121460a4bb3 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Mon, 18 Sep 2023 18:34:23 +0200 Subject: [PATCH 124/173] add first draft for projection documentation --- docs/user/projection.md | 63 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 docs/user/projection.md diff --git a/docs/user/projection.md b/docs/user/projection.md new file mode 100644 index 00000000..e8e35989 --- /dev/null +++ b/docs/user/projection.md @@ -0,0 +1,63 @@ +# Projection command +The ppanggolin projection command allows you to annotate external genomes using an existing pangenome. This process eliminates the need to recompute all components, streamlining the annotation process. Input genomes are expected to belong to the same species. + +Genes within the input genome are aligned with genes in the pangenome to determine their gene families and partitions. Genes that do not align with any existing gene in the pangenome are considered specific to the input genome and are assigned to the "Cloud" partition. Based on the alignment and partition assignment, Regions of Plasticity (RGPs) within the input genome are predicted. Each RGP that is not located on a contig border is assigned to a spot of insertion. Finally, conserved modules of the pangenome found in the input genome are reported in the output files. 
+ +## Input files: + +This command supports two input modes depending on whether you want to project a single genome or multiple genomes at once: + +Multiple Files in One TSV: +- **Options**: `--fasta` or `--anno` +- **Description**: You can provide a tab-separated file listing organism names alongside their respective FASTA genomic sequences or annotation filepaths, with one line per organism. This mode is suitable when you want to annotate multiple genomes in a single operation. The format of this file is identical to the format used in the annotate and workflow commands; for more details, refer here. + +Single File: +- **Options**: `--organism_name` with `--fasta` or `--anno` and `--circular_contigs` (optional) +- **Description**: When annotating a single genome, you can directly provide a single FASTA genomic sequence file or an annotation file in GFF/GBFF format. Additionally, specify the name of the organism using the `--organism_name` option. You can also indicate circular contigs using the `--circular_contigs` option when necessary. + + +## Output files: + +The Output directory contains `summary_projection.tsv` giving an overview of the projection, with one line per organism.
+ + +| Column | Description| +|--------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| Organism name | This column contains the name or identifier of the organism being analyzed.| +| Pangenome file | The path to the pangenome file (pangenome.h5) used for the analysis.| +| Contigs | The number of contigs in the projected genome.| +| Genes | The total number of genes identified in the input genome.| +| Families | The total number of gene families to which genes in the genome of the input organism are assigned.| +| Persistent genes | The number of genes in the "Persistent" partition.| +| Persistent families | The number of gene families in the "Persistent" partition.| +| Shell genes | The number of genes in the "Shell" partition.| +| Shell families | The number of gene families in the "Shell" partition.| +| Cloud genes | The number of genes in the "Cloud" partition.| +| Cloud families | The number of gene families in the "Cloud" partition.| +| Cloud specific families | The number of gene families that are specific to the input organism. These families are unique to the input organism and do not have homologs in any other genomes within the pangenome and have been assigned to the "Cloud" partition.| +| RGPs (Regions of Genomic Plasticity) | The number of Regions of Genomic Plasticity (RGPs) predicted within the input genome.| +| Spots | The total number of spots of insertion associated with RGPs in the input genome.| +| New spots | The number of new insertion spots that have been identified in the input genome.
These spots represent novel genomic regions compared to other genomes in the pangenome.| +| Modules | The number of modules that have been projected onto the input genome.| + + +Additionally, within the Output directory, there is a subdirectory for each input genome, named after the input genome itself. Each of these subdirectories contains several files: + +For Gene Family and Partition of Input Genes: + +- `cds_sequences.fasta`: This file contains the sequences of coding regions (CDS) from the input genome. +- `gene_to_gene_family.tsv`: It provides the mapping of genes to gene families of the pangenome. Its format follows [this output](Outputs.md#gene-families-and-genes). +- `sequences_partition_projection.tsv`: This file maps each input gene to its partition (Persistent, Shell or Cloud). +- `specific_genes.tsv`: This file lists the genes of the input genome that do not align to any gene of the pangenome. These genes are assigned to the Cloud partition. + +For RGPs and Spots: + +- `plastic_regions.tsv`: This file contains information about Regions of Genomic Plasticity (RGPs) within the input genome. Its format follows [this output](Outputs.md#plastic-regions). +- `input_organism_rgp_to_spot.tsv`: It provides information about the association between RGPs and insertion spots in the input genome. Its format follows [this output](Outputs.md#spots). + +Optionally, you can produce a graph of the RGPs using the `--spot_graph` option. This graph is similar to the one produced by the `ppanggolin spot` command. + +For Modules: + +- `modules_in_input_organism.tsv`: This file lists the modules that have been found in the input genome. Its format follows [this output](Outputs.md#modules-in-organisms).
+ From 935d1f4e47023a086f8110394171a01b3e7f94f9 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 19 Sep 2023 12:15:01 +0200 Subject: [PATCH 125/173] make organism name optional --- ppanggolin/projection/projection.py | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index e5c8df73..156ae4b7 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -1120,24 +1120,15 @@ def check_projection_arguments(args: argparse.Namespace, parser: argparse.Argume if input_mode == "multiple": # We are in paths file mode - incompatible_args = ["organism_name", "circular_contigs"] - for single_arg in incompatible_args: - if getattr(args, single_arg) is not None: - parser.error("You provided a TSV file listing the files of genomes you wish to annotate. " - f"Therefore, the single genome argument '--{single_arg}' is incompatible with this multiple genomes file.") + if args.circular_contigs: + parser.error("You provided a TSV file listing the files of genomes you wish to annotate. " + f"Therefore, the argument '--circular_contigs' is incompatible with this multiple genomes file.") if args.fasta: check_input_files(args.fasta, True) if args.anno: check_input_files(args.anno, True) - - elif input_mode == "single": - # We are in single file mode - - if args.organism_name is None: - parser.error("You directly provided a single FASTA/GBFF/GFF file. Please specify the name of the input organism you want to annotate. 
" - "You can use the --organism_name argument either through the command line or the config file.") return input_mode @@ -1179,7 +1170,7 @@ def parser_projection(parser: argparse.ArgumentParser): required_single = parser.add_argument_group(title="Single Genome Arguments", description="Use these options when providing a single FASTA or annotation file:") - required_single.add_argument("-n", '--organism_name', required=False, type=str, + required_single.add_argument("-n", '--organism_name', required=False, type=str, default="input_genome", help="Specify the name of the organism whose genome you want to annotate when providing a single FASTA or annotation file.") required_single.add_argument('--circular_contigs', nargs="+", required=False, type=tuple, From 9200a511c3d2865c002bbf979a52a8d2805a963e Mon Sep 17 00:00:00 2001 From: Adelme Bazin Date: Tue, 19 Sep 2023 20:48:22 +0200 Subject: [PATCH 126/173] Update annotate.py --- ppanggolin/annotate/annotate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ppanggolin/annotate/annotate.py b/ppanggolin/annotate/annotate.py index f76f3a91..7fe15dbd 100644 --- a/ppanggolin/annotate/annotate.py +++ b/ppanggolin/annotate/annotate.py @@ -9,7 +9,7 @@ from pathlib import Path import tempfile import time -from typing import List, Set, Tuple +from typing import List, Set, Tuple, Iterable # installed libraries from tqdm import tqdm From 14f935704faf3f463eee43d53bf1d89cb669786a Mon Sep 17 00:00:00 2001 From: Adelme Bazin Date: Wed, 20 Sep 2023 20:19:44 +0200 Subject: [PATCH 127/173] add logging if genes share start position --- ppanggolin/genome.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ppanggolin/genome.py b/ppanggolin/genome.py index 6552348e..9f3e0498 100644 --- a/ppanggolin/genome.py +++ b/ppanggolin/genome.py @@ -331,7 +331,7 @@ def __setitem__(self, start: int, gene: Gene): if not isinstance(gene, Gene): raise TypeError(f"'Gene' type was expected but you provided a '{type(gene)}' type 
object") if start in self._genes_getter: - raise ValueError(f"Gene with start position {start} already exists in the contig") + raise ValueError(f"Gene '{self._genes_getter[start].ID}' with start position {start} already exists in the contig '{self.name}', cannot add gene '{gene.ID}'") if gene.position is None: raise AttributeError("The gene object needs to have its position in the contig filled before adding it") # Adding empty values. From 3a42d78eee4b58913536081bfa73f6a198554d39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Thu, 21 Sep 2023 11:40:22 +0200 Subject: [PATCH 128/173] fix problem to read contig length with source in gbff --- VERSION | 2 +- ppanggolin/annotate/annotate.py | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/VERSION b/VERSION index 0da01cf8..1b3db87e 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.187 +1.2.188 diff --git a/ppanggolin/annotate/annotate.py b/ppanggolin/annotate/annotate.py index 7fe15dbd..411fd2d3 100644 --- a/ppanggolin/annotate/annotate.py +++ b/ppanggolin/annotate/annotate.py @@ -115,14 +115,15 @@ def read_org_gbff(organism_name: str, gbff_file_path: Path, circular_contigs: Li # beginning of contig is_circ = False contig_id = None + contig_len = None if line.startswith('LOCUS'): if "CIRCULAR" in line.upper(): # this line contains linear/circular word telling if the dna sequence is circularized or not is_circ = True # TODO maybe it could be a good thing to add a elif for linear # and if circular or linear are not found raise a warning - contig_id = line.split()[1] + contig_len = int(line.split()[2]) # If contig_id is not specified in VERSION afterward like with Prokka, in that case we use the one in LOCUS while not line.startswith('FEATURES'): if line.startswith('VERSION') and line[12:].strip() != "": @@ -136,6 +137,7 @@ def read_org_gbff(organism_name: str, gbff_file_path: Path, circular_contigs: Li except KeyError: contig = Contig(contig_id, True if contig_id in 
circular_contigs or is_circ else False) organism.add(contig) + contig.length = contig_len # start of the feature object. dbxref = set() gene_name = "" @@ -185,9 +187,6 @@ def read_org_gbff(organism_name: str, gbff_file_path: Path, circular_contigs: Li pass # don't know what to do with that, ignoring for now. # there is a protein with a frameshift mecanism. - elif curr_type == 'source': # Get Contig length - start, end = map(int, map(str.strip, line[21:].split('..'))) - contig.length = end - start + 1 elif useful_info: # current info goes to current objtype, if it's useful. if line[21:].startswith("/db_xref"): dbxref.add(line.split("=")[1].replace('"', '').strip()) From d1f22317d55603caac17b5e512d84b983022be1c Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Thu, 21 Sep 2023 13:47:46 +0200 Subject: [PATCH 129/173] add gene line for each CDS ARN --- ppanggolin/formats/writeFlat.py | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/ppanggolin/formats/writeFlat.py b/ppanggolin/formats/writeFlat.py index 5f9f1da0..13320a33 100644 --- a/ppanggolin/formats/writeFlat.py +++ b/ppanggolin/formats/writeFlat.py @@ -684,8 +684,13 @@ def write_gff_file(org: Organism, contig_to_rgp: Dict[Contig, Region], strand = feature.strand source = annotation_sources.get(feat_type, "external") + + # before the CDS or RNA line a gene line is created. 
with the following id + parent_gene_id=f"gene-{feature.ID}" + attributes = [("ID", feature.ID), ("Name", feature.name), + ('Parent', parent_gene_id), ("product", feature.product), ] @@ -697,14 +702,32 @@ def write_gff_file(org: Organism, contig_to_rgp: Dict[Contig, Region], ("Family", feature.family.name), ("Partition", feature.family.named_partition), ('RGP', rgp), - ('Module', ','.join((f"module_{module.ID}" for module in feature.family.modules)) ) + ('Module', ','.join((f"module_{module.ID}" for module in feature.family.modules)) ) ] + + + # add an extra line of type gene + + gene_line = [contig.name, + source, + 'gene', + feature.start, + feature.stop, + '.', + strand, + ".", + f'ID={parent_gene_id}' + ] + + line_str = '\t'.join(map(str, gene_line)) + outfile.write(line_str + "\n") + elif type(feature) == Region: feat_type = "region" source = "ppanggolin" strand = "." - score = feature.score # TODO is RGP score make sens and do we want it in gff file? + score = feature.score # TODO does RGP score make sens and do we want it in gff file? 
attributes = [ ("Name", feature.name), ("Spot", rgp_to_spotid.get(feature, "No_spot")), @@ -728,6 +751,7 @@ def write_gff_file(org: Organism, contig_to_rgp: Dict[Contig, Region], ".", attributes_str, ] + line_str = '\t'.join(map(str, line)) outfile.write(line_str + "\n") From 8c36cce5b99353b6de593ce5ddae5b00921d5e82 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Thu, 21 Sep 2023 17:32:54 +0200 Subject: [PATCH 130/173] Add ID to contig --- VERSION | 2 +- ppanggolin/align/alignOnPang.py | 12 ++-- ppanggolin/annotate/annotate.py | 88 ++++++++++++++----------- ppanggolin/annotate/synta.py | 16 ++++- ppanggolin/context/searchGeneContext.py | 2 +- ppanggolin/formats/readBinaries.py | 59 +++++++++-------- ppanggolin/formats/writeAnnotations.py | 21 +++--- ppanggolin/genome.py | 7 +- ppanggolin/pangenome.py | 45 +++++++++---- ppanggolin/projection/projection.py | 55 ++++++++++------ tests/region/test_rgp_cluster.py | 2 +- tests/test_genefamily.py | 6 +- tests/test_genome.py | 16 ++--- tests/test_pangenome.py | 10 +-- tests/test_region.py | 14 ++-- 15 files changed, 211 insertions(+), 144 deletions(-) diff --git a/VERSION b/VERSION index 1b3db87e..3cff2901 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.188 +1.2.189 diff --git a/ppanggolin/align/alignOnPang.py b/ppanggolin/align/alignOnPang.py index e06d67a0..48af789c 100644 --- a/ppanggolin/align/alignOnPang.py +++ b/ppanggolin/align/alignOnPang.py @@ -555,13 +555,13 @@ def align(pangenome: Pangenome, sequence_file: Path, output: Path, identity: flo if use_representatives: align_file, seq2pang = get_input_seq_to_family_with_rep(pangenome, [sequence_file], output, new_tmpdir, is_input_seq_nt=is_nucleotide, - cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, - translation_table=translation_table, disable_bar=disable_bar) + cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, + translation_table=translation_table, disable_bar=disable_bar) else: - align_file, 
seq2pang = get_input_seq_to_family_with_all(pangenome=pangenome, sequence_file=[sequence_file], - output=output, tmpdir=new_tmpdir, is_input_seq_nt=is_nucleotide, - cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, - translation_table=translation_table, disable_bar=disable_bar) + align_file, seq2pang = get_input_seq_to_family_with_all(pangenome=pangenome, sequence_files=[sequence_file], + output=output, tmpdir=new_tmpdir, is_input_seq_nt=is_nucleotide, + cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, + translation_table=translation_table, disable_bar=disable_bar) if getinfo or draw_related: # TODO Add getinfo to function and remove if get_seq_info(seq2pang, pangenome, output, draw_related, disable_bar=disable_bar) diff --git a/ppanggolin/annotate/annotate.py b/ppanggolin/annotate/annotate.py index 411fd2d3..5d4c5624 100644 --- a/ppanggolin/annotate/annotate.py +++ b/ppanggolin/annotate/annotate.py @@ -4,6 +4,7 @@ # default libraries import argparse import logging +from concurrent.futures import ProcessPoolExecutor from multiprocessing import get_context import os from pathlib import Path @@ -15,13 +16,16 @@ from tqdm import tqdm # local libraries -from ppanggolin.annotate.synta import annotate_organism, read_fasta, get_dna_sequence +from ppanggolin.annotate.synta import annotate_organism, read_fasta, get_dna_sequence, init_contig_counter, contig_counter from ppanggolin.pangenome import Pangenome from ppanggolin.genome import Organism, Gene, RNA, Contig from ppanggolin.utils import read_compressed_or_not, mk_file_name, detect_filetype, check_input_files from ppanggolin.formats import write_pangenome +ctg_counter = contig_counter + + def check_annotate_args(args: argparse.Namespace): """Check That the given arguments are usable @@ -104,6 +108,8 @@ def read_org_gbff(organism_name: str, gbff_file_path: Path, circular_contigs: Li :return: Organism complete and true for sequence in file """ + global ctg_counter + organism = 
Organism(organism_name) logging.getLogger("PPanGGOLiN").debug(f"Extracting genes informations from the given gbff {gbff_file_path.name}") # revert the order of the file, to read the first line first. @@ -135,7 +141,10 @@ def read_org_gbff(organism_name: str, gbff_file_path: Path, circular_contigs: Li try: contig = organism.get(contig_id) except KeyError: - contig = Contig(contig_id, True if contig_id in circular_contigs or is_circ else False) + with contig_counter.get_lock(): + contig = Contig(contig_counter.value, contig_id, + True if contig_id in circular_contigs or is_circ else False) + contig_counter.value += 1 organism.add(contig) contig.length = contig_len # start of the feature object. @@ -247,6 +256,8 @@ def read_org_gff(organism: str, gff_file_path: Path, circular_contigs: List[str] :return: Organism object and if there are sequences associated or not """ + global ctg_counter + (gff_seqname, _, gff_type, gff_start, gff_end, _, gff_strand, _, gff_attribute) = range(0, 9) # Missing values: source, score, frame. They are unused. 
@@ -299,7 +310,10 @@ def get_id_attribute(attributes_dict: dict) -> str: has_fasta = True elif line.startswith('sequence-region', 2, 17): fields = [el.strip() for el in line.split()] - contig = Contig(fields[1], True if fields[1] in circular_contigs else False) + with contig_counter.get_lock(): + contig = Contig(contig_counter.value, fields[1], + True if fields[1] in circular_contigs else False) + contig_counter.value += 1 org.add(contig) contig.length = int(fields[-1]) - int(fields[3]) + 1 @@ -350,7 +364,6 @@ def get_id_attribute(attributes_dict: dict) -> str: rna_counter += 1 contig.add_rna(rna) - # GET THE FASTA SEQUENCES OF THE GENES if has_fasta and fasta_string != "": contig_sequences = read_fasta(org, fasta_string.split('\n')) # _ is total contig length @@ -362,16 +375,6 @@ def get_id_attribute(attributes_dict: dict) -> str: return org, has_fasta -def launch_read_anno(args: Tuple[str, Path, List[str], bool]) -> Tuple[Organism, bool]: - """ Allow to launch in multiprocessing the read of genome annotation - - :param args: Pack of argument for annotate_organism function - - :return: Organism object for pangenome - """ - return read_anno_file(*args) - - def read_anno_file(organism_name: str, filename: Path, circular_contigs: list, pseudo: bool = False) -> Tuple[Organism, bool]: """ @@ -384,6 +387,7 @@ def read_anno_file(organism_name: str, filename: Path, circular_contigs: list, p :return: Annotated organism for pangenome and true for sequence in file """ + global ctg_counter filetype = detect_filetype(filename) if filetype == "gff": try: @@ -420,7 +424,8 @@ def chose_gene_identifiers(pangenome: Pangenome) -> bool: else: return False - + + def local_identifiers_are_unique(genes: Iterable[Gene]) -> bool: """ Check if local_identifiers of genes are uniq in order to decide if they should be used as gene id. 
@@ -452,6 +457,7 @@ def read_annotations(pangenome: Pangenome, organisms_file: Path, cpu: int = 1, p :param pseudo: allow to read pseudogène :param disable_bar: Disable the progresse bar """ + logging.getLogger("PPanGGOLiN").info(f"Reading {organisms_file.name} the list of organism files ...") pangenome.status["geneSequences"] = "Computed" @@ -464,12 +470,22 @@ def read_annotations(pangenome: Pangenome, organisms_file: Path, cpu: int = 1, p if not org_path.exists(): # Check tsv sanity test if it's not one it's the other org_path = organisms_file.parent.joinpath(org_path) args.append((elements[0], org_path, elements[2:], pseudo)) - with get_context('fork').Pool(cpu) as p: - for org, flag in tqdm(p.imap_unordered(launch_read_anno, args), unit="file", total=len(args), - disable=disable_bar): - pangenome.add_organism(org) - if not flag: - pangenome.status["geneSequences"] = "No" + + with ProcessPoolExecutor(mp_context=get_context('fork'), max_workers=cpu, + initializer=init_contig_counter, initargs=(contig_counter, )) as executor: + with tqdm(total=len(args), unit="file", disable=disable_bar) as progress: + futures = [] + + for fn_args in args: + future = executor.submit(read_anno_file, *fn_args) + future.add_done_callback(lambda p: progress.update()) + futures.append(future) + + for future in futures: + org, flag = future.result() + pangenome.add_organism(org) + if not flag: + pangenome.status["geneSequences"] = "No" # decide whether we use local ids or ppanggolin ids. 
used_local_identifiers = chose_gene_identifiers(pangenome) @@ -529,16 +545,6 @@ def get_gene_sequences_from_fastas(pangenome: Pangenome, fasta_files: List[Path] pangenome.status["geneSequences"] = "Computed" -def launch_annotate_organism(pack: Tuple[str, Path, List[str], str, int, bool, str, bool, str]) -> Organism: - """ Allow to launch in multiprocessing the genome annotation - - :param pack: Pack of argument for annotate_organism function - - :return: Organism object for pangenome - """ - return annotate_organism(*pack) - - def annotate_pangenome(pangenome: Pangenome, fasta_list: Path, tmpdir: str, cpu: int = 1, translation_table: int = 11, kingdom: str = "bacteria", norna: bool = False, allow_overlap: bool = False, procedure: str = None, disable_bar: bool = False): @@ -575,12 +581,18 @@ def annotate_pangenome(pangenome: Pangenome, fasta_list: Path, tmpdir: str, cpu: raise Exception("There are no genomes in the provided file") logging.getLogger("PPanGGOLiN").info(f"Annotating {len(arguments)} genomes using {cpu} cpus...") - with get_context('fork').Pool(processes=cpu) as p: - for organism in tqdm(p.imap_unordered(launch_annotate_organism, arguments), unit="genome", - total=len(arguments), disable=disable_bar): - pangenome.add_organism(organism) - p.close() - p.join() + with ProcessPoolExecutor(mp_context=get_context('fork'), max_workers=cpu, + initializer=init_contig_counter, initargs=(contig_counter,)) as executor: + with tqdm(total=len(arguments), unit="file", disable=disable_bar) as progress: + futures = [] + + for fn_args in arguments: + future = executor.submit(annotate_organism, *fn_args) + future.add_done_callback(lambda p: progress.update()) + futures.append(future) + + for future in futures: + pangenome.add_organism(future.result()) logging.getLogger("PPanGGOLiN").info("Done annotating genomes") pangenome.status["genomesAnnotated"] = "Computed" # the pangenome is now annotated. 
@@ -591,9 +603,9 @@ def annotate_pangenome(pangenome: Pangenome, fasta_list: Path, tmpdir: str, cpu: pangenome.parameters["annotate"]["translation_table"] = translation_table pangenome.parameters["annotate"]["prodigal_procedure"] = None if procedure is None else procedure pangenome.parameters["annotate"]["allow_overlap"] = allow_overlap - pangenome.parameters["annotate"]["norna"] = norna pangenome.parameters["annotate"]["# read_annotations_from_file"] = False + def launch(args: argparse.Namespace): """ Command launcher diff --git a/ppanggolin/annotate/synta.py b/ppanggolin/annotate/synta.py index c21dbdfd..a72de538 100644 --- a/ppanggolin/annotate/synta.py +++ b/ppanggolin/annotate/synta.py @@ -6,6 +6,7 @@ import os import tempfile from io import TextIOWrapper +from multiprocessing import Value from subprocess import Popen, PIPE import ast from collections import defaultdict @@ -17,6 +18,15 @@ from ppanggolin.utils import is_compressed, read_compressed_or_not +contig_counter: Value = Value('i', 0) + + +def init_contig_counter(value: Value): + """Initialize the contig counter for later use""" + global contig_counter + contig_counter = value + + def reverse_complement(seq: str): """reverse complement the given dna sequence @@ -160,6 +170,8 @@ def read_fasta(org: Organism, fna_file: Union[TextIOWrapper, list]) -> Dict[str, :return: Dictionnary with contig_name as keys and contig sequence in values """ + global contig_counter + try: contigs = {} contig_seq = "" @@ -173,7 +185,9 @@ def read_fasta(org: Organism, fna_file: Union[TextIOWrapper, list]) -> Dict[str, try: contig = org.get(line.split()[0][1:]) except KeyError: - contig = Contig(line.split()[0][1:]) + with contig_counter.get_lock(): + contig = Contig(contig_counter.value, line.split()[0][1:]) + contig_counter.value += 1 org.add(contig) else: contig_seq += line.strip() diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index e1403a5f..730b47e2 100644 --- 
a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -68,7 +68,7 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: Path, tmpdir: cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, translation_table=translation_table, disable_bar=disable_bar) else: - _, seq2pan = get_input_seq_to_family_with_all(pangenome=pangenome, sequence_file=[sequence_file], + _, seq2pan = get_input_seq_to_family_with_all(pangenome=pangenome, sequence_files=[sequence_file], output=output, tmpdir=new_tmpdir, is_input_seq_nt=is_nucleotide, cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py index 9b781cf6..5b5a2af6 100644 --- a/ppanggolin/formats/readBinaries.py +++ b/ppanggolin/formats/readBinaries.py @@ -3,10 +3,8 @@ # default libraries import logging -import sys from pathlib import Path -from typing import TextIO, Dict, Any , List, Tuple - +from typing import TextIO, Dict, Any, List # installed libraries from tables import Table @@ -21,7 +19,6 @@ from ppanggolin.metadata import Metadata - class Genedata: """ This is a general class storing unique gene-related data to be written in a specific @@ -106,6 +103,7 @@ def fix_partitioned(pangenome_file: str): del status_group._v_attrs.Partitionned h5f.close() + def get_status(pangenome: Pangenome, pangenome_file: Path): """ Checks which elements are already present in the file. 
@@ -139,7 +137,6 @@ def get_status(pangenome: Pangenome, pangenome_file: Path): if hasattr(status_group._v_attrs, "modules") and status_group._v_attrs.modules: pangenome.status["modules"] = "inFile" - # pangenome.status["annotations_sources"] = status_group._v_attrs.annotations_sources if hasattr(status_group._v_attrs, "metadata") and status_group._v_attrs.metadata: metastatus = status_group.metastatus @@ -153,6 +150,7 @@ def get_status(pangenome: Pangenome, pangenome_file: Path): pangenome.parameters = info_group._v_attrs.parameters h5f.close() + def read_chunks(table: Table, column: str = None, chunk: int = 10000): """ Reading entirely the provided table (or column if specified) chunk per chunk to limit RAM usage. @@ -202,8 +200,9 @@ def read_sequences(h5f: tables.File) -> dict: seqid2seq[row["seqid"]] = row['dna'].decode() return seqid2seq + def get_non_redundant_gene_sequences_from_file(pangenome_filename: str, file_obj: TextIO, add: str = '', - disable_bar: bool = False): + disable_bar: bool = False): """ Writes the non redundant CDS sequences of the Pangenome object to a File object that can be filtered or not by a list of CDS, and adds the eventual str 'add' in front of the identifiers. Loads the sequences from a .h5 pangenome file. @@ -215,21 +214,21 @@ def get_non_redundant_gene_sequences_from_file(pangenome_filename: str, file_obj """ - logging.getLogger("PPanGGOLiN").info(f"Extracting and writing non redundant CDS sequences from {pangenome_filename} to {file_obj.name}") - + logging.getLogger("PPanGGOLiN").info( + f"Extracting and writing non redundant CDS sequences from {pangenome_filename} to {file_obj.name}") + with tables.open_file(pangenome_filename, "r", driver_core_backing_store=0) as h5f: # get a dictionarry mapping seqid to cds_name # seqid are uniq and can have multiple cds name. 
# We just want one of the cds name to have non redundant fasta sequences seqid2cds_name = {} - for row in read_chunks(h5f.root.geneSequences, chunk=20000): + for row in read_chunks(h5f.root.annotations.geneSequences, chunk=20000): # Read the table chunk per chunk otherwise RAM dies on big pangenomes seqid2cds_name[row["seqid"]] = row["gene"].decode() - table = h5f.root.sequences + table = h5f.root.annotations.sequences for row in tqdm(read_chunks(table, chunk=20000), total=table.nrows, unit="gene", disable=disable_bar): - cds_name = seqid2cds_name[row["seqid"]] file_obj.write(f'>{add}{cds_name}\n') file_obj.write(f'{row["dna"].decode()}\n') @@ -249,7 +248,8 @@ def get_gene_sequences_from_file(pangenome_filename: str, file_obj: TextIO, list :param add: Add a prefix to sequence header :param disable_bar: Prevent to print disable progress bar """ - logging.getLogger("PPanGGOLiN").info(f"Extracting and writing CDS sequences from a {pangenome_filename} file to a fasta file...") + logging.getLogger("PPanGGOLiN").info( + f"Extracting and writing CDS sequences from a {pangenome_filename} file to a fasta file...") h5f = tables.open_file(pangenome_filename, "r", driver_core_backing_store=0) table = h5f.root.annotations.geneSequences @@ -424,6 +424,7 @@ def read_modules(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = Fal pangenome.add_module(module) pangenome.status["modules"] = "Loaded" + def read_organisms(pangenome: Pangenome, table: tables.Table, chunk_size: int = 20000, disable_bar: bool = False): """Read organism table in pangenome file to add them to the pangenome object @@ -433,14 +434,13 @@ def read_organisms(pangenome: Pangenome, table: tables.Table, chunk_size: int = :param chunk_size: Size of the chunck reading :param disable_bar: Disable progress bar """ - contig2organism = {} for row in tqdm(read_chunks(table, chunk=chunk_size), total=table.nrows, unit="genome", disable=disable_bar): organism = Organism(row["name"].decode()) 
pangenome.add_organism(organism) def read_contigs(pangenome: Pangenome, table: tables.Table, chunk_size: int = 20000, - disable_bar: bool = False): + disable_bar: bool = False): """Read contig table in pangenome file to add them to the pangenome object :param pangenome: Pangenome object @@ -449,8 +449,7 @@ def read_contigs(pangenome: Pangenome, table: tables.Table, chunk_size: int = 20 :param disable_bar: Disable progress bar """ for row in tqdm(read_chunks(table, chunk=chunk_size), total=table.nrows, unit="contig", disable=disable_bar): - contig = Contig(name=row["name"].decode()) - contig.is_circular = row["is_circular"] + contig = Contig(identifier=int(row["ID"]), name=row["name"].decode(), is_circular=row["is_circular"]) contig.length = int(row["length"]) try: organism = pangenome.get_organism(row["organism"].decode()) @@ -459,6 +458,7 @@ def read_contigs(pangenome: Pangenome, table: tables.Table, chunk_size: int = 20 else: organism.add(contig) + def read_genes(pangenome: Pangenome, table: tables.Table, genedata_dict: Dict[int, Genedata], link: bool = True, chunk_size: int = 20000, disable_bar: bool = False): """Read genes in pangenome file to add them to the pangenome object @@ -482,7 +482,7 @@ def read_genes(pangenome: Pangenome, table: tables.Table, genedata_dict: Dict[in genetic_code=genedata.genetic_code, product=genedata.product, local_identifier=local) gene.is_fragment = row["is_fragment"] if link: - contig = pangenome.get_contig(row["contig"].decode()) + contig = pangenome.get_contig(int(row["contig"])) gene.fill_parents(contig.organism, contig) contig.add(gene) @@ -505,13 +505,14 @@ def read_rnas(pangenome: Pangenome, table: tables.Table, genedata_dict: Dict[int gene_type=genedata.gene_type, name=genedata.name, product=genedata.product) if link: - contig = pangenome.get_contig(row["contig"].decode()) + contig = pangenome.get_contig(int(row["contig"])) rna.fill_parents(contig.organism, contig) contig.add_rna(rna) def read_annotation(pangenome: 
Pangenome, h5f: tables.File, load_organisms: bool = True, load_contigs: bool = True, - load_genes: bool = True, load_rnas: bool = True, chunk_size: int = 20000, disable_bar: bool = False): + load_genes: bool = True, load_rnas: bool = True, chunk_size: int = 20000, + disable_bar: bool = False): """ Read annotation in pangenome hdf5 file to add in pangenome object @@ -522,22 +523,21 @@ def read_annotation(pangenome: Pangenome, h5f: tables.File, load_organisms: bool annotations = h5f.root.annotations genedata_dict = None if load_organisms: - read_organisms(pangenome, annotations.genomes, disable_bar=disable_bar) + read_organisms(pangenome, annotations.genomes, chunk_size=chunk_size, disable_bar=disable_bar) if load_contigs: - read_contigs(pangenome, annotations.contigs, disable_bar=disable_bar) + read_contigs(pangenome, annotations.contigs, chunk_size=chunk_size, disable_bar=disable_bar) if load_genes: genedata_dict = read_genedata(h5f) read_genes(pangenome, annotations.genes, genedata_dict, - all([load_organisms, load_contigs]), disable_bar=disable_bar) + all([load_organisms, load_contigs]), chunk_size=chunk_size, disable_bar=disable_bar) if load_rnas: read_rnas(pangenome, annotations.RNAs, read_genedata(h5f) if genedata_dict is None else genedata_dict, - all([load_organisms, load_contigs]), disable_bar=disable_bar) + all([load_organisms, load_contigs]), chunk_size=chunk_size, disable_bar=disable_bar) pangenome.status["genomesAnnotated"] = "Loaded" - def read_info(h5f: tables.File): """ Read the pangenome content @@ -655,6 +655,7 @@ def read_metadata(pangenome: Pangenome, h5f: tables.File, metatype: str, element.add_metadata(source=source, metadata=meta) pangenome.status["metadata"][metatype] = "Loaded" + def read_parameters(h5f: tables.File): """ Read pangenome parameters @@ -668,6 +669,7 @@ def read_parameters(h5f: tables.File): for param_name, val in param_name_to_value.items(): print(f" {param_name} : {val}") + def get_pangenome_parameters(h5f: tables.File) -> 
Dict[str, Dict[str, Any]]: """ Read and return the pangenome parameters. @@ -775,7 +777,8 @@ def read_pangenome(pangenome, annotation: bool = False, gene_families: bool = Fa if h5f.root.status._v_attrs.metadata: metastatus = h5f.root.status._f_get_child("metastatus") metasources = h5f.root.status._f_get_child("metasources") - if metastatus._v_attrs[metatype] and all([True if source in metasources._v_attrs[metatype] else False for source in sources]): + if metastatus._v_attrs[metatype] and all( + [True if source in metasources._v_attrs[metatype] else False for source in sources]): logging.getLogger().info(f"Reading the {metatype} metadata from sources {sources}...") read_metadata(pangenome, h5f, metatype, sources, disable_bar=disable_bar) else: @@ -783,6 +786,7 @@ def read_pangenome(pangenome, annotation: bool = False, gene_families: bool = Fa f"or has been improperly filled") h5f.close() + def check_pangenome_info(pangenome, need_annotations: bool = False, need_families: bool = False, need_graph: bool = False, need_partitions: bool = False, need_rgp: bool = False, need_spots: bool = False, need_gene_sequences: bool = False, need_modules: bool = False, @@ -864,10 +868,11 @@ def check_pangenome_info(pangenome, need_annotations: bool = False, need_familie if source in pangenome.status["metasources"][metatype]: metadata = True else: - raise Exception(f"There is no metadata assign to {metatype} for source : {source} in your pangenome.") + raise Exception( + f"There is no metadata assign to {metatype} for source : {source} in your pangenome.") else: metadata = True - elif not pangenome.status["metastatus"][metatype] in ["Computed", "Loaded"]: + elif pangenome.status["metastatus"][metatype] not in ["Computed", "Loaded"]: raise Exception(f"Your pangenome don't have any metadata for {metatype}. 
See the 'metadata' subcommand") if any([annotation, gene_families, graph, rgp, spots, gene_sequences, modules, metadata]): diff --git a/ppanggolin/formats/writeAnnotations.py b/ppanggolin/formats/writeAnnotations.py index b09e4166..bb1de011 100644 --- a/ppanggolin/formats/writeAnnotations.py +++ b/ppanggolin/formats/writeAnnotations.py @@ -84,7 +84,8 @@ def contig_desc(contig_len: int, org_len: int) -> Dict[str, Union[tables.StringC :return: Formatted table """ - return {'name': tables.StringCol(itemsize=contig_len), + return {'ID': tables.UInt32Col(), + 'name': tables.StringCol(itemsize=contig_len), "is_circular": tables.BoolCol(dflt=False), 'length': tables.UInt32Col(), "organism": tables.StringCol(itemsize=org_len)} @@ -104,6 +105,7 @@ def write_contigs(pangenome: Pangenome, h5f: tables.File, annotation: tables.Gro logging.getLogger("PPanGGOLiN").debug(f"Writing {pangenome.number_of_contigs} contigs") contig_row = contig_table.row for contig in tqdm(pangenome.contigs, total=pangenome.number_of_contigs, unit="contigs", disable=disable_bar): + contig_row["ID"] = contig.ID contig_row["name"] = contig.name contig_row["is_circular"] = contig.is_circular contig_row["length"] = len(contig) @@ -112,12 +114,11 @@ def write_contigs(pangenome: Pangenome, h5f: tables.File, annotation: tables.Gro contig_table.flush() -def gene_desc(id_len: int, max_local_id: int, max_contig_len: int) -> Dict[str, Union[tables.StringCol, tables.UInt32Col, tables.BoolCol]]: +def gene_desc(id_len: int, max_local_id: int) -> Dict[str, Union[tables.StringCol, tables.UInt32Col, tables.BoolCol]]: """Table description to save gene-related information :param id_len: Maximum size of gene name :param max_local_id: Maximum size of gene local identifier - :param max_contig_len: Maximum size of contig identifier :return: Formatted table """ @@ -125,7 +126,7 @@ def gene_desc(id_len: int, max_local_id: int, max_contig_len: int) -> Dict[str, 'genedata_id': tables.UInt32Col(), 'local': 
tables.StringCol(itemsize=max_local_id), 'is_fragment': tables.BoolCol(dflt=False), - 'contig': tables.StringCol(itemsize=max_contig_len)} + 'contig': tables.UInt32Col()} def write_genes(pangenome: Pangenome, h5f: tables.File, annotation: tables.Group, @@ -150,7 +151,7 @@ def write_genes(pangenome: Pangenome, h5f: tables.File, annotation: tables.Grou gene_row["ID"] = gene.ID gene_row["is_fragment"] = gene.is_fragment gene_row["local"] = gene.local_identifier - gene_row["contig"] = gene.contig.name + gene_row["contig"] = gene.contig.ID genedata = get_genedata(gene) genedata_id = genedata2gene.get(genedata) if genedata_id is None: @@ -163,7 +164,7 @@ def write_genes(pangenome: Pangenome, h5f: tables.File, annotation: tables.Grou return genedata2gene -def rna_desc(id_len: int, max_contig_len: int) -> Dict[str, Union[tables.StringCol, tables.UInt32Col]]: +def rna_desc(id_len: int) -> Dict[str, Union[tables.StringCol, tables.UInt32Col]]: """Table description to save rna-related information :param id_len: Maximum size of RNA identifier @@ -173,7 +174,7 @@ def rna_desc(id_len: int, max_contig_len: int) -> Dict[str, Union[tables.StringC """ return {'ID': tables.StringCol(itemsize=id_len), 'genedata_id': tables.UInt32Col(), - 'contig': tables.StringCol(itemsize=max_contig_len)} + 'contig': tables.UInt32Col()} def write_rnas(pangenome: Pangenome, h5f: tables.File, annotation: tables.Group, @@ -196,7 +197,7 @@ def write_rnas(pangenome: Pangenome, h5f: tables.File, annotation: tables.Group rna_row = rna_table.row for rna in tqdm(pangenome.RNAs, total=pangenome.number_of_rnas, unit="RNA", disable=disable_bar): rna_row["ID"] = rna.ID - rna_row["contig"] = rna.contig.name + rna_row["contig"] = rna.contig.ID genedata = get_genedata(rna) genedata_id = genedata2rna.get(genedata) if genedata_id is None: @@ -336,11 +337,11 @@ def write_annotations(pangenome: Pangenome, h5f: tables.File, rec_organisms: boo desc = contig_desc(contig_len, org_len) write_contigs(pangenome, h5f, 
annotation, desc, disable_bar) if rec_genes: - desc = gene_desc(gene_id_len, gene_local_id, contig_len) + desc = gene_desc(gene_id_len, gene_local_id) genedata2gene = write_genes(pangenome, h5f, annotation, desc, disable_bar) write_genedata(pangenome, h5f, annotation, genedata2gene, disable_bar) if rec_rnas: - desc = rna_desc(rna_id_len, contig_len) + desc = rna_desc(rna_id_len) genedata2rna = write_rnas(pangenome, h5f, annotation, desc, disable_bar) write_genedata(pangenome, h5f, annotation, genedata2rna, disable_bar) diff --git a/ppanggolin/genome.py b/ppanggolin/genome.py index 9f3e0498..6965b0ff 100644 --- a/ppanggolin/genome.py +++ b/ppanggolin/genome.py @@ -299,12 +299,13 @@ class Contig: - RNAs: Set of RNA annotations present in the contig. """ - def __init__(self, name: str, is_circular: bool = False): + def __init__(self, identifier: int, name: str, is_circular: bool = False): """Constructor method :param name: Name of the contig :param is_circular: saves if the contig is circular """ + self.ID = identifier self.name = name self.is_circular = is_circular self._rna_getter = set() # Saving the rna annotations. We're not using them in the vast majority of cases. 
@@ -331,7 +332,9 @@ def __setitem__(self, start: int, gene: Gene): if not isinstance(gene, Gene): raise TypeError(f"'Gene' type was expected but you provided a '{type(gene)}' type object") if start in self._genes_getter: - raise ValueError(f"Gene '{self._genes_getter[start].ID}' with start position {start} already exists in the contig '{self.name}', cannot add gene '{gene.ID}'") + raise ValueError(f"Gene '{self._genes_getter[start].ID}' with start position {start} already exists in the " + f"contig '{self.name}' {f'from organism {self.organism}' if self.organism is not None else ''}, " + f"cannot add gene '{gene.ID}' {f'from organism {gene.organism}' if gene.organism is not None else ''}") if gene.position is None: raise AttributeError("The gene object needs to have its position in the contig filled before adding it") # Adding empty values. diff --git a/ppanggolin/pangenome.py b/ppanggolin/pangenome.py index b5bf7c88..1b78676d 100644 --- a/ppanggolin/pangenome.py +++ b/ppanggolin/pangenome.py @@ -322,28 +322,45 @@ def _mk_contig_getter(self): """ self._contig_getter = {} for contig in self.contigs: - self._contig_getter[contig.name] = contig + self._contig_getter[contig.ID] = contig - def get_contig(self, name: str) -> Contig: - """Returns the contig that has the given name + def get_contig(self, identifier: int = None, name: str = None, organism_name: str = None) -> Contig: + """Returns the contig by his identifier or by his name. 
If name is given the organism name is needed - :param name: The ,ame of the contig to look for + :param identifier: ID of the contig to look for + :param name: The name of the contig to look for + :param organism_name: Name of the organism to which the contig belong :return: Returns the wanted contig :raises AssertionError: If the `gene_id` is not an integer :raises KeyError: If the `gene_id` is not in the pangenome """ - assert isinstance(name, str), "Contig name should be a string" - - try: - return self._contig_getter[name] - except AttributeError: - # in that case, either the gene getter has not been computed, or the geneID is not in the pangenome. - self._mk_contig_getter() # make it - return self.get_contig(name) # Return what was expected. If geneID does not exist it will raise an error. - except KeyError: - raise KeyError(f"Contig: {name}, does not exist in the pangenome.") + if identifier is None: + if name is None: + raise ValueError("Neiher identifier or name of the contig are given.") + else: + if not isinstance(name, str): + raise AssertionError("Contig name should be a string") + + if organism_name is None: + raise ValueError("You should provide the name of the organism to which the contig belong") + else: + if not isinstance(organism_name, str): + raise AssertionError("Organism name should be a string") + organism = self.get_organism(organism_name) + return organism.get(name) + else: + if not isinstance(identifier, int): + raise AssertionError("Contig ID should be an integer") + try: + return self._contig_getter[identifier] + except AttributeError: + # in that case, either the gene getter has not been computed, or the geneID is not in the pangenome. + self._mk_contig_getter() # make it + return self.get_contig(identifier) # Return what was expected. If geneID does not exist it will raise an error. 
+ except KeyError: + raise KeyError(f"Contig: {identifier}, does not exist in the pangenome.") def get_organism(self, name: str) -> Organism: """ Get an organism that is expected to be in the pangenome using its name, which is supposedly unique. diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index 156ae4b7..c2643c4b 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -3,7 +3,8 @@ # default libraries import argparse -from multiprocessing import get_context +from concurrent.futures import ProcessPoolExecutor +from multiprocessing import get_context, Value import logging import os import time @@ -24,7 +25,7 @@ # # local libraries from ppanggolin.annotate.synta import read_fasta, get_dna_sequence -from ppanggolin.annotate.annotate import launch_read_anno, launch_annotate_organism, local_identifiers_are_unique +from ppanggolin.annotate.annotate import init_contig_counter, read_anno_file, annotate_organism, local_identifiers_are_unique from ppanggolin.annotate import subparser as annotate_subparser from ppanggolin.pangenome import Pangenome from ppanggolin.utils import detect_filetype, create_tmpdir, read_compressed_or_not, write_compressed_or_not, restricted_float, mk_outdir, get_config_args, parse_config_file, get_default_args, check_input_files @@ -207,13 +208,13 @@ def launch(args: argparse.Namespace): write_summaries(organism_2_summary, output_dir) -def annotate_fasta_files(genome_name_to_fasta_path: Dict[str,dict], tmpdir: str, cpu: int = 1, translation_table: int = 11, +def annotate_fasta_files(genome_name_to_fasta_path: Dict[str, dict], tmpdir: str, cpu: int = 1, translation_table: int = 11, kingdom: str = "bacteria", norna: bool = False, allow_overlap: bool = False, procedure: str = None, disable_bar: bool = False): """ Main function to annotate a pangenome - :param genome_name_to_annot_path: + :param genome_name_to_fasta_path: :param fasta_list: List of fasta file containing 
sequences that will be base of pangenome :param tmpdir: Path to temporary directory :param cpu: number of CPU cores to use @@ -233,18 +234,23 @@ def annotate_fasta_files(genome_name_to_fasta_path: Dict[str,dict], tmpdir: str, norna, kingdom, allow_overlap, procedure)) logging.getLogger("PPanGGOLiN").info(f"Annotating {len(arguments)} genomes using {cpu} cpus...") - with get_context('fork').Pool(processes=cpu) as p: - for organism in tqdm(p.imap_unordered(launch_annotate_organism, arguments), unit="genome", - total=len(arguments), disable=disable_bar): - - organisms.append(organism) - p.close() - p.join() + contig_counter = Value('i', 0) + with ProcessPoolExecutor(mp_context=get_context('fork'), max_workers=cpu, + initializer=init_contig_counter, initargs=(contig_counter,)) as executor: + with tqdm(total=len(arguments), unit="file", disable=disable_bar) as progress: + futures = [] + + for fn_args in arguments: + future = executor.submit(annotate_organism, *fn_args) + future.add_done_callback(lambda p: progress.update()) + futures.append(future) + + for future in futures: + organisms.append(future.result()) return organisms - def read_annotation_files(genome_name_to_annot_path: Dict[str,dict], cpu: int = 1, pseudo: bool = False, disable_bar: bool = False) -> Tuple[List[Organism], Dict[Organism,bool]]: """ @@ -264,17 +270,24 @@ def read_annotation_files(genome_name_to_annot_path: Dict[str,dict], cpu: int = # unless a gff file without fasta is met (which is the only case where sequences can be absent) org_to_has_fasta_flag = {} + args = [(org_name, org_info['path'], org_info['circular_contigs'], pseudo) + for org_name, org_info in genome_name_to_annot_path.items()] - for org_name, org_info in genome_name_to_annot_path.items(): - - args.append((org_name, org_info['path'], org_info['circular_contigs'], pseudo)) + contig_counter = Value('i', 0) + with ProcessPoolExecutor(mp_context=get_context('fork'), max_workers=cpu, + initializer=init_contig_counter, 
initargs=(contig_counter,)) as executor: + with tqdm(total=len(args), unit="file", disable=disable_bar) as progress: + futures = [] - with get_context('fork').Pool(cpu) as p: - for org, has_fasta in tqdm(p.imap_unordered(launch_read_anno, args), unit="file", total=len(args), - disable=disable_bar): - - organisms.append(org) - org_to_has_fasta_flag[org] = has_fasta + for fn_args in args: + future = executor.submit(read_anno_file, *fn_args) + future.add_done_callback(lambda p: progress.update()) + futures.append(future) + + for future in futures: + org, has_fasta = future.result() + organisms.append(org) + org_to_has_fasta_flag[org] = has_fasta genes = (gene for org in organisms for gene in org.genes) diff --git a/tests/region/test_rgp_cluster.py b/tests/region/test_rgp_cluster.py index d6913d30..b1ded836 100644 --- a/tests/region/test_rgp_cluster.py +++ b/tests/region/test_rgp_cluster.py @@ -16,7 +16,7 @@ def genes() -> Generator[Set[Gene], None, None]: """Create a set of genes to fill gene families """ organism = Organism("organism") - contig = Contig("contig") + contig = Contig(0, "contig") genes = set() for i in range(0, randint(11, 20)): gene = Gene(f"gene_{str(i)}") diff --git a/tests/test_genefamily.py b/tests/test_genefamily.py index 9baddb00..e1eefd3f 100644 --- a/tests/test_genefamily.py +++ b/tests/test_genefamily.py @@ -120,9 +120,11 @@ def organisms(self, genes) -> Generator[Set[Organism], None, None]: nb_organisms = randint(2, 10) nb_genes_per_organisms = len(genes) // nb_organisms idx_org = 1 + contig_counter = 0 while idx_org < nb_organisms: organism = Organism(f"organism_{idx_org}") - contig = Contig(f"contig_{idx_org}") + contig = Contig(contig_counter, f"contig_{idx_org}") + contig_counter organism.add(contig) idx_genes = 0 while idx_genes < nb_genes_per_organisms: @@ -134,7 +136,7 @@ def organisms(self, genes) -> Generator[Set[Organism], None, None]: idx_org += 1 # last family fill with all the gene left organism = Organism(f"organism_{idx_org}") 
- contig = Contig(f"contig_{idx_org}") + contig = Contig(contig_counter, f"contig_{idx_org}") organism.add(contig) idx_genes = (idx_org - 1) * nb_genes_per_organisms while idx_genes < len(genes): diff --git a/tests/test_genome.py b/tests/test_genome.py index c464bef1..35ac4714 100644 --- a/tests/test_genome.py +++ b/tests/test_genome.py @@ -92,7 +92,7 @@ def test_fill_parents(self, feature): """Tests that 'fill_parents' method associates the object with the given organism and contig """ organism = Organism('org_id') - contig = Contig('contig_name') + contig = Contig(0, 'contig_name') feature.fill_annotations(1, 10, '+', 'gene_type', 'name', 'product', 'local_id') feature.fill_parents(organism, contig) assert feature.organism == organism @@ -102,7 +102,7 @@ def test_fill_parents_with_organism_or_contig_only(self, feature): """Tests that Gene can be filled with only an organism or a contig """ organism = Organism('org') - contig = Contig("ctg") + contig = Contig(0, "ctg") feature.fill_annotations(1, 10, '+', 'gene_type', 'name', 'product', 'local_id') feature.fill_parents(organism=organism) assert feature.organism == organism @@ -131,7 +131,7 @@ def test_set_organism_not_isinstance_organism(self, feature): def test_set_contig(self, feature): """Tests that contig setter sets contig with the valid type """ - contig = Contig('contig') + contig = Contig(0, 'contig') feature.contig = contig assert feature.contig == contig @@ -268,7 +268,7 @@ class TestContig: def contig(self) -> Generator[Contig, None, None]: """Generate basic contig for tests """ - yield Contig("contig") + yield Contig(0, "contig") @pytest.fixture def gene(self) -> Generator[Gene, None, None]: @@ -446,7 +446,7 @@ def organism(self) -> Generator[Organism, None, None]: def contig(self) -> Generator[Contig, None, None]: """Generate a basic contig for test """ - yield Contig("contig") + yield Contig(0, "contig") @pytest.fixture def gene(self) -> Generator[Gene, None, None]: @@ -498,7 +498,7 @@ def 
test_add_contig_existing_name(self, organism, contig): """ organism.add(contig) with pytest.raises(KeyError): - organism.add(Contig('contig')) + organism.add(Contig(0, 'contig')) def test_get_contig(self, organism, contig): """Tests that a contig can be retrieved from an Organism instance @@ -521,8 +521,8 @@ def test_get_nonexistent_contig(self, organism): def test_number_of_contigs(self, organism): """Tests that the number of contigs in an organism instance can be retrieved """ - organism.add(Contig('contig1')) - organism.add(Contig('contig2')) + organism.add(Contig(1, 'contig1')) + organism.add(Contig(2, 'contig2')) assert organism.number_of_contigs == 2 assert isinstance(len(organism), int) diff --git a/tests/test_pangenome.py b/tests/test_pangenome.py index 6a4d7e72..8b1123ad 100644 --- a/tests/test_pangenome.py +++ b/tests/test_pangenome.py @@ -335,8 +335,8 @@ def fill_org_with_genes(self) -> Generator[Union[Organism, Set[Gene]], None, Non """ genes = set() organism = Organism(name="organism") - for contig_id in range(randint(2, 10)): - contig = Contig("k_{}".format(contig_id)) + for ctg_counter, contig_id in enumerate(range(randint(2, 10))): + contig = Contig(ctg_counter, "k_{}".format(contig_id)) organism.add(contig) for gene_idx in range(randint(2, 10)): gene = Gene(gene_id=f"{organism.name}.{contig_id}.{gene_idx}") @@ -455,8 +455,8 @@ def make_gene_pair(gene_id_1: int = 1, gene_id_2: int = 2) -> Tuple[Gene, Gene]: gene2 = Gene(gene_id=f"gene_{gene_id_2}") fam1 = GeneFamily(family_id=1, name=f"fam_{gene_id_1}") fam2 = GeneFamily(family_id=2, name=f"fam_{gene_id_2}") - ctg1 = Contig(name=f"ctg_{gene_id_1}") - ctg2 = Contig(name=f"ctg_{gene_id_2}") + ctg1 = Contig(1, name=f"ctg_{gene_id_1}") + ctg2 = Contig(2, name=f"ctg_{gene_id_2}") fam1.add(gene1) fam2.add(gene2) organism = Organism(name=f"org_{choices([gene_id_1, gene_id_2], k=1)}") @@ -808,7 +808,7 @@ def add_element_to_pangenome(self, pangenome): pangenome.add_gene_family(family) org = Organism("Org") 
org.add_metadata(source=metadata.source, metadata=metadata) - ctg = Contig("Ctg") + ctg = Contig(0, "Ctg") org.add(ctg) gene = Gene("Gene") gene.position, gene.start = (0, 0) diff --git a/tests/test_region.py b/tests/test_region.py index 16a4d68f..dbff76b0 100644 --- a/tests/test_region.py +++ b/tests/test_region.py @@ -145,9 +145,9 @@ def test_add_genes_from_different_contigs(self, region): gene1, gene2 = Gene('gene_1'), Gene('gene_2') gene1.fill_annotations(start=0, stop=10, strand='+', position=0) gene2.fill_annotations(start=11, stop=20, strand='+', position=1) - gene1.fill_parents(None, Contig('contig_1')) + gene1.fill_parents(None, Contig(1, 'contig_1')) region.add(gene1) - gene2.fill_parents(None, Contig('contig_2')) + gene2.fill_parents(None, Contig(2, 'contig_2')) with pytest.raises(Exception): region.add(gene2) @@ -223,7 +223,7 @@ def test_get_contig(self, region): """ gene = Gene('gene') gene.fill_annotations(start=0, stop=10, strand='+', position=0) - gene.fill_parents(contig=Contig("contig")) + gene.fill_parents(contig=Contig(0, "contig")) region.add(gene) assert region.contig.name == 'contig' @@ -233,7 +233,7 @@ def test_is_whole_contig_true(self, region): starter, stopper = Gene('starter'), Gene('stopper') starter.fill_annotations(start=0, stop=10, strand='+', position=0) stopper.fill_annotations(start=11, stop=20, strand='+', position=1) - contig = Contig("contig") + contig = Contig(0, "contig") contig[starter.start], contig[stopper.start] = starter, stopper starter.fill_parents(None, contig), stopper.fill_parents(None, contig) region.add(starter), region.add(stopper) @@ -247,7 +247,7 @@ def test_is_whole_contig_false(self, region): starter.fill_annotations(start=11, stop=20, strand='+', position=1) stopper.fill_annotations(start=21, stop=30, strand='+', position=2) after.fill_annotations(start=31, stop=40, strand='+', position=3) - contig = Contig("contig") + contig = Contig(0, "contig") contig[before.start], contig[after.start] = before, after 
contig[starter.start], contig[stopper.start] = starter, stopper before.fill_parents(None, contig), after.fill_parents(None, contig) @@ -263,7 +263,7 @@ def test_is_contig_border_true(self, region): starter.fill_annotations(start=11, stop=20, strand='+', position=1) stopper.fill_annotations(start=21, stop=30, strand='+', position=2) after.fill_annotations(start=31, stop=40, strand='+', position=3) - contig = Contig("contig") + contig = Contig(0, "contig") before.fill_parents(None, contig), after.fill_parents(None, contig) starter.fill_parents(None, contig), stopper.fill_parents(None, contig) # Test bordering right @@ -284,7 +284,7 @@ def test_is_contig_border_false(self, region): starter.fill_annotations(start=11, stop=20, strand='+', position=1) stopper.fill_annotations(start=21, stop=30, strand='+', position=2) after.fill_annotations(start=31, stop=40, strand='+', position=3) - contig = Contig("contig") + contig = Contig(0, "contig") contig[before.start], contig[after.start] = before, after contig[starter.start], contig[stopper.start] = starter, stopper before.fill_parents(None, contig), after.fill_parents(None, contig) From 7ba87e8261566695886ba1ad5f30fdfd2759a22c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Thu, 21 Sep 2023 18:26:58 +0200 Subject: [PATCH 131/173] Add metadata to contigs --- VERSION | 2 +- ppanggolin/formats/writeMetadata.py | 11 ++++++++++ ppanggolin/genome.py | 3 ++- ppanggolin/meta/meta.py | 31 ++++++++++++++++++++++++++--- ppanggolin/pangenome.py | 6 +++++- 5 files changed, 47 insertions(+), 6 deletions(-) diff --git a/VERSION b/VERSION index 3cff2901..1b38fdcf 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.189 +1.2.190 diff --git a/ppanggolin/formats/writeMetadata.py b/ppanggolin/formats/writeMetadata.py index 1d0ffa34..59c3d7c4 100644 --- a/ppanggolin/formats/writeMetadata.py +++ b/ppanggolin/formats/writeMetadata.py @@ -42,6 +42,9 @@ def write_metadata_status(pangenome: Pangenome, h5f: tables.File, 
status_group: if metastatus["genes"] in ["Computed", "Loaded", "inFile"]: metadata_group._v_attrs.genes = True metasources_group._v_attrs.genes = metasources["genes"] + if metastatus["contigs"] in ["Computed", "Loaded", "inFile"]: + metadata_group._v_attrs.contigs = True + metasources_group._v_attrs.contigs = metasources["contigs"] if metastatus["genomes"] in ["Computed", "Loaded", "inFile"]: metadata_group._v_attrs.genomes = True metasources_group._v_attrs.genomes = metasources["genomes"] @@ -238,6 +241,14 @@ def write_metadata(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = F "genomes", select_genomes, disable_bar) pangenome.status["metadata"]["genomes"] = "Loaded" + if pangenome.status["metadata"]["contigs"] == "Computed": + logging.getLogger().info("Writing contigs metadata in pangenome") + select_genomes = list(pangenome.get_elem_by_sources(source=pangenome.status["metasources"]["contigs"][-1], + metatype="contigs")) + write_metadata_metatype(h5f, pangenome.status["metasources"]["contigs"][-1], + "contigs", select_genomes, disable_bar) + pangenome.status["metadata"]["contigs"] = "Loaded" + if pangenome.status["metadata"]["genes"] == "Computed": logging.getLogger().info("Writing genes metadata in pangenome") select_genes = list(pangenome.get_elem_by_sources(source=pangenome.status["metasources"]["genes"][-1], diff --git a/ppanggolin/genome.py b/ppanggolin/genome.py index 6965b0ff..b7e53fc6 100644 --- a/ppanggolin/genome.py +++ b/ppanggolin/genome.py @@ -285,7 +285,7 @@ def add_protein(self, protein: str): self.protein = protein -class Contig: +class Contig(MetaFeatures): """ Describe the contig content and some information Methods: @@ -305,6 +305,7 @@ def __init__(self, identifier: int, name: str, is_circular: bool = False): :param name: Name of the contig :param is_circular: saves if the contig is circular """ + super().__init__() self.ID = identifier self.name = name self.is_circular = is_circular diff --git a/ppanggolin/meta/meta.py 
b/ppanggolin/meta/meta.py index b07ea8c8..ba4a2589 100644 --- a/ppanggolin/meta/meta.py +++ b/ppanggolin/meta/meta.py @@ -64,7 +64,7 @@ def check_metadata_format(metadata: Path, metatype: str) -> pd.DataFrame: :return: Dataframe with metadata loaded """ - assert metatype in ["families", "genomes", "genes", "RGPs", "spots", "modules"] + assert metatype in ["families", "genomes", "contigs", "genes", "RGPs", "spots", "modules"] colname_check = re.compile('^[a-zA-Z_][a-zA-Z0-9_]*$') metadata_df = pd.read_csv(metadata, sep="\t", header=0, quoting=csv.QUOTE_NONE, dtype={metatype: str}) @@ -95,7 +95,25 @@ def assign_metadata(metadata_df: pd.DataFrame, pangenome: Pangenome, source: str :raise KeyError: element name is not find in pangenome :raise AssertionError: Metatype is not recognized """ - assert metatype in ["families", "genomes", "genes", "RGPs", "spots", "modules"] + assert metatype in ["families", "genomes", "contigs", "genes", "RGPs", "spots", "modules"] + + org2contig = None + + def check_duplicate_contig_name(): + contig_names = set() + for contig in pangenome.contigs: + old_len = len(contig_names) + contig_names.add(contig.name) + if len(contig_names) == old_len: + raise Exception("There are 2 contigs with the same name in the pangenome and " + "you did not provide the genome linked to contig. 
" + "Add a column 'genomes' to indicate to which genome the contig belongs to.") + + print(metadata_df.columns) + if metatype == "contigs" and "genomes" not in metadata_df.columns: + check_duplicate_contig_name() + org2contig = {contig.name: contig.organism.name for contig in pangenome.contigs} + for row in tqdm(metadata_df.iterrows(), unit='row', total=metadata_df.shape[0], disable=disable_bar): row = row[1] @@ -104,6 +122,10 @@ def assign_metadata(metadata_df: pd.DataFrame, pangenome: Pangenome, source: str element = pangenome.get_gene_family(row[metatype]) elif metatype == "genomes": element = pangenome.get_organism(row[metatype]) + elif metatype == "contigs": + org = row["genomes"] if "genomes" in metadata_df.columns else org2contig[row[metatype]] + print("pika") + element = pangenome.get_contig(name=row[metatype], organism_name=org) elif metatype == "genes": element = pangenome.get_gene(row[metatype]) elif metatype == "RGPs": @@ -119,7 +141,10 @@ def assign_metadata(metadata_df: pd.DataFrame, pangenome: Pangenome, source: str logging.getLogger().debug(f"{metatype}: {row[metatype]} doesn't exist") else: meta = Metadata(source=source, **{k: v for k, v in row.to_dict().items() if k != metatype}) + if metatype == "contigs": + meta.genomes = element.organism.name element.add_metadata(source=source, metadata=meta) + print(element.metadata) pangenome.status["metadata"][metatype] = "Computed" pangenome.status["metasources"][metatype].append(source) @@ -169,7 +194,7 @@ def parser_meta(parser: argparse.ArgumentParser): required.add_argument("-s", "--source", required=False, type=str, nargs="?", help='Name of the metadata source') required.add_argument("-a", "--assign", required=False, type=str, nargs="?", - choices=["families", "genomes", "genes", "RGPs", "spots", "modules"], + choices=["families", "genomes", "contigs", "genes", "RGPs", "spots", "modules"], help="Select to which pangenome element metadata will be assigned") optional = 
parser.add_argument_group(title="Optional arguments") optional.add_argument("--omit", required=False, action="store_true", diff --git a/ppanggolin/pangenome.py b/ppanggolin/pangenome.py index 1b78676d..bf72e1f8 100644 --- a/ppanggolin/pangenome.py +++ b/ppanggolin/pangenome.py @@ -51,12 +51,14 @@ def __init__(self): 'modules': 'No', "metadata": {"families": 'No', "genes": 'No', + "contigs": 'No', "genomes": 'No', "RGPs": 'No', "spots": 'No', "modules": 'No'}, "metasources": {"families": [], "genes": [], + "contigs": [], "genomes": [], "RGPs": [], "spots": [], @@ -697,6 +699,8 @@ def select_elem(self, metatype: str): return self.gene_families elif metatype == "genomes": return self.organisms + elif metatype == "contigs": + return self.contigs elif metatype == "genes": return self.genes elif metatype == "RGPs": @@ -754,7 +758,7 @@ def get_elem_by_sources(self, source: List[str], metatype: str) -> Generator[ :return: Gene families with the source """ - assert metatype in ["families", "genomes", "genes", "RGPs", "spots", "modules"] + assert metatype in ["families", "genomes", "contigs", "genes", "RGPs", "spots", "modules"] for elem in self.select_elem(metatype): if elem.get_metadata_by_source(source) is not None: yield elem From 1dab861ca57088675f4df3b8d5b260927cc9be15 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Mon, 25 Sep 2023 11:59:57 +0200 Subject: [PATCH 132/173] adapt proksee panorama code to ppanggolin --- ppanggolin/formats/proksee_template.json | 74 ++++++ ppanggolin/formats/writeFlat.py | 38 ++- ppanggolin/formats/write_proksee.py | 305 +++++++++++++++++++++++ ppanggolin/genome.py | 18 +- 4 files changed, 425 insertions(+), 10 deletions(-) create mode 100644 ppanggolin/formats/proksee_template.json create mode 100644 ppanggolin/formats/write_proksee.py diff --git a/ppanggolin/formats/proksee_template.json b/ppanggolin/formats/proksee_template.json new file mode 100644 index 00000000..a6952dbd --- /dev/null +++ b/ppanggolin/formats/proksee_template.json 
@@ -0,0 +1,74 @@ +{ + "cgview": { + "settings":{ + "format":"circular", + "geneticCode":11, + "backgroundColor":"rgba(255,255,255,1)", + "showShading":true, + "arrowHeadLength":0.3, + "minArcLength":1, + "initialMapThicknessProportion":0.1, + "maxMapThicknessProportion":0.5}, + "backbone":{"color":"rgba(128,128,128,1)", + "colorAlternate":"rgba(200,200,200,1)", + "thickness":5, + "decoration":"arrow" + }, + "ruler": { + "font":"sans-serif,plain,10", + "color":"rgba(0,0,0,1)"}, + "annotation":{"font":"monospace,plain,12", + "onlyDrawFavorites":false, + "visible":true + }, + "dividers":{ + "track": { + "color": "rgba(50,50,50,1)", + "thickness": 1, + "spacing": 1 + }, + "slot":{ + "visible":true, + "color":"rgba(0,0,0,1)", + "thickness":1, + "spacing":1 + } + }, + "highlighter":{ + "visible":true + }, + "captions":[ + { + "position":"bottom-center", + "textAlignment":"center", + "font":"sans-serif,plain,24", + "fontColor":"rgba(0,0,0,1)", + "backgroundColor":"rgba(255,255,255,0.4)" + } + ], + "legend":{ + "position":"top-right", + "textAlignment":"left", + "defaultFont":"sans-serif,plain,14", + "defaultFontColor":"rgba(0,0,0,1)", + "backgroundColor":"rgba(255,255,255,0.75)" + }, + "sequence": { + "color": "rgb(0,0,0)", + "font": "sans-serif,plain,14" + }, + "bookmarks": [ + { + "bbOffset": 86.75, + "bp": 5617, + "favorite": false, + "format": "circular", + "name": "Bookmark-1", + "shortcut": "1", + "zoom": 1.685 + } + ], + "plots": [ + ] + } +} \ No newline at end of file diff --git a/ppanggolin/formats/writeFlat.py b/ppanggolin/formats/writeFlat.py index c4d85ff3..84ae194e 100644 --- a/ppanggolin/formats/writeFlat.py +++ b/ppanggolin/formats/writeFlat.py @@ -7,7 +7,7 @@ from multiprocessing import get_context from collections import Counter, defaultdict import logging -from typing import TextIO, Dict +from typing import TextIO,List, Dict from pathlib import Path import pkg_resources from statistics import median, mean, stdev @@ -21,7 +21,7 @@ from 
ppanggolin.pangenome import Pangenome from ppanggolin.utils import write_compressed_or_not, mk_outdir, restricted_float from ppanggolin.formats.readBinaries import check_pangenome_info - +from ppanggolin.formats.write_proksee import write_proksee_organism # global variable to store the pangenome pan = Pangenome() # TODO change to pangenome:Pangenome = Pangenome=() ? needAnnotations = False @@ -617,6 +617,23 @@ def write_projections(output: Path, compress: bool = False): write_org_file(org, outdir, compress) logging.getLogger("PPanGGOLiN").info("Done writing the projection files") + +def write_proksee(output: Path, compress: bool = False): + """ + """ + + features = ["all"] + template = Path(__file__).parent.joinpath("proksee_template").with_suffix(".json") + + organism_with_rgp = {rgp.organism for rgp in pan.regions} + + for organism in organism_with_rgp : #pan.organisms: + write_proksee_organism(pan, organism, output, + template, features) + + + + def write_gff(output: str, compress: bool = False): """ Write the gff files for all organisms @@ -660,7 +677,6 @@ def write_gff_file(org: Organism, contig_to_rgp: Dict[Contig, Region], :param outdir: Path to the output directory where the GFF file will be written. :param compress: If True, compress the output GFF file using .gz format. :param annotation_sources: A dictionary that maps types of features to their source information. 
- :type annotation_sources: Dict[str, str] """ @@ -1060,7 +1076,7 @@ def write_rgp_modules(output: Path, compress: bool = False): def write_flat_files(pangenome: Pangenome, output: Path, cpu: int = 1, soft_core: float = 0.95, dup_margin: float = 0.05, csv: bool = False, gene_pa: bool = False, gexf: bool = False, - light_gexf: bool = False, projection: bool = False, gff: bool = False, stats: bool = False, json: bool = False, + light_gexf: bool = False, projection: bool = False, gff: bool = False, proksee: bool = False, stats: bool = False, json: bool = False, partitions: bool = False, regions: bool = False, families_tsv: bool = False, spots: bool = False, borders: bool = False, modules: bool = False, spot_modules: bool = False, compress: bool = False, disable_bar: bool = False): @@ -1091,7 +1107,7 @@ def write_flat_files(pangenome: Pangenome, output: Path, cpu: int = 1, soft_core :param disable_bar: Disable progress bar """ # TODO Add force parameter to check if output already exist - if not any(x for x in [csv, gene_pa, gexf, light_gexf, projection, gff, stats, json, partitions, regions, spots, borders, + if not any(x for x in [csv, gene_pa, gexf, light_gexf, projection, gff, proksee, stats, json, partitions, regions, spots, borders, families_tsv, modules, spot_modules]): raise Exception("You did not indicate what file you wanted to write.") @@ -1111,10 +1127,10 @@ def write_flat_files(pangenome: Pangenome, output: Path, cpu: int = 1, soft_core pan = pangenome if csv or gene_pa or gexf or light_gexf or projection or stats or json or partitions or regions or spots or \ - families_tsv or borders or modules or spot_modules or gff: + families_tsv or borders or modules or spot_modules or gff or proksee: needAnnotations = True needFamilies = True - if projection or stats or partitions or regions or spots or borders or gff: + if projection or stats or partitions or regions or spots or borders or gff or proksee: needPartitions = True if gexf or light_gexf or json: 
needGraph = True @@ -1132,7 +1148,7 @@ def write_flat_files(pangenome: Pangenome, output: Path, cpu: int = 1, soft_core needSpots = True if modules or spot_modules: # or projection: needModules = True - if projection or gff: + if projection or gff or proksee: needRegions = True if pan.status["predictedRGP"] == "inFile" else False needSpots = True if pan.status["spots"] == "inFile" else False needModules = True if pan.status["modules"] == "inFile" else False @@ -1155,6 +1171,8 @@ def write_flat_files(pangenome: Pangenome, output: Path, cpu: int = 1, soft_core processes.append(p.apply_async(func=write_projections, args=(output, compress))) if gff: processes.append(p.apply_async(func=write_gff, args=(output, compress))) + if proksee: + processes.append(p.apply_async(func=write_proksee, args=(output, compress))) if stats: processes.append(p.apply_async(func=write_stats, args=(output, soft_core, dup_margin, compress))) if json: @@ -1191,7 +1209,7 @@ def launch(args: argparse.Namespace): global pan pan.add_file(args.pangenome) write_flat_files(pan, args.output, cpu=args.cpu, soft_core=args.soft_core, dup_margin=args.dup_margin, csv=args.csv, - gene_pa=args.Rtab, gexf=args.gexf, light_gexf=args.light_gexf, projection=args.projection, gff=args.gff, + gene_pa=args.Rtab, gexf=args.gexf, light_gexf=args.light_gexf, projection=args.projection, gff=args.gff, proksee=args.proksee, stats=args.stats, json=args.json, partitions=args.partitions, regions=args.regions, families_tsv=args.families_tsv, spots=args.spots, borders=args.borders, modules=args.modules, spot_modules=args.spot_modules, compress=args.compress, disable_bar=args.disable_prog_bar) @@ -1242,6 +1260,8 @@ def parser_flat(parser: argparse.ArgumentParser): "on the organism") optional.add_argument("--gff", required=False, action="store_true", help="Generate a gff file for each organism containing pangenome annotations.") + optional.add_argument("--proksee", required=False, action="store_true", + help="Generate a json 
file for each organism containing pangenome annotations to be used to in proksee.") optional.add_argument("--stats", required=False, action="store_true", help="tsv files with some statistics for each organism and for each gene family") optional.add_argument("--partitions", required=False, action="store_true", diff --git a/ppanggolin/formats/write_proksee.py b/ppanggolin/formats/write_proksee.py new file mode 100644 index 00000000..5474b320 --- /dev/null +++ b/ppanggolin/formats/write_proksee.py @@ -0,0 +1,305 @@ +#!/usr/bin/env python3 +# coding:utf-8 + +# default libraries +from concurrent.futures import ThreadPoolExecutor +from datetime import datetime +import json +import logging +from pathlib import Path +from random import randint +from tqdm import tqdm +from typing import Dict, List, Tuple +from uuid import uuid4 + +# installed libraries +from bokeh.palettes import Category20 +from ppanggolin.genome import Organism, Contig, Gene +from ppanggolin.region import Spot + +# local libraries +from ppanggolin.pangenome import Pangenome + + +def palette() -> List[Tuple[int]]: + palette = [] + for hex_color in list(Category20[20]): + palette.append(tuple(int(hex_color.strip('#')[i:i + 2], 16) for i in (0, 2, 4))) + return palette + + +def read_settings(settings_data: dict): + if "format" not in settings_data: + settings_data["format"] = "circular" + if "geneticCode" not in settings_data: + # TODO Manage genetic code + settings_data["geneticCode"] = "11" + + +def write_legend_items(legend_data: dict, features: List[str]): #, sources: List[str]): + colors = palette() + legend_data["items"] = [#{"name": "CDS", "swatchColor": f"rgba({','.join(map(str, colors.pop(1)))},0.5)", "decoration": "arrow"}, + {"name": "persistent", "swatchColor": "rgba(229,156,4,1)", "decoration": "arrow"}, + {"name": "shell", "swatchColor": "rgba(60,254,91,1)", "decoration": "arrow"}, + {"name": "cloud", "swatchColor": f"rgba({','.join(map(str, colors.pop(17)))},1)", "decoration": "arrow"}, + 
{"name": "RNA", "swatchColor": "rgba(137,23,207,0.5)", "decoration": "arrow"},] + if "rgp" in features or "all" in features: + legend_data["items"].append({"name": "RGP", "swatchColor": f"rgba({','.join(map(str, colors.pop(6)))}, 1)", "decoration": "arc"}), + if "spots" in features or "all" in features: + legend_data["items"].append({"name": "Spot", "swatchColor": f"rgba({','.join(map(str, colors.pop(5)))}, 1)", "decoration": "arc"}) + if "modules" in features or "all" in features: + legend_data["items"].append({"name": "Module", "swatchColor": f"rgba({','.join(map(str, colors.pop(3)))},1)", "decoration": "arc"}) + # if "systems" in features or "all" in features: + # for source in sources: + # color = ','.join(map(str, colors.pop(randint(0, len(colors) - 1)))) + # legend_data["items"].append({"name": source, "decoration": "arc", "swatchColor": f"rgba({color},1)"}) + + +def write_tracks(features: List[str]): + + tracks = [{"name": "Gene", "separateFeaturesBy": "None", "position": "outside", "thicknessRatio": 1, + "dataType": "feature", "dataMethod": "source", "dataKeys": "Gene"}, + {"name": "Partition", "separateFeaturesBy": "strand", "position": "outside", "thicknessRatio": 1, + "dataType": "feature", "dataMethod": "source", "dataKeys": "partition"}] + + if "rgp" in features or "all" in features: + tracks.append({"name": "RGP", "separateFeaturesBy": "None", "position": "inside", "thicknessRatio": 1, + "dataType": "feature", "dataMethod": "source", "dataKeys": "RGP"}), + # if "spots" in features or "all" in features: + # tracks.append({"name": "Spots", "separateFeaturesBy": "None", "position": "inside", "thicknessRatio": 1, + # "dataType": "feature", "dataMethod": "source", "dataKeys": "Spot"}) + if "modules" in features or "all" in features: + tracks.append({"name": "Module", "separateFeaturesBy": "None", "position": "inside", "thicknessRatio": 1, + "dataType": "feature", "dataMethod": "source", "dataKeys": "Module"}) + + return tracks + + +def read_data(template: 
Path, features: List[str], sources: List[str] = None) -> dict: + """ + """ + + with open(template, "r") as template_file: + proksee_data = json.load(template_file) + + now = datetime.now() + + if "created" in proksee_data["cgview"]: + proksee_data["cgview"]["updated"] = now.strftime("%Y-%m-%d %H:%M:%S") + last_version = proksee_data["cgview"]["version"].split('.') + proksee_data["cgview"]["version"] = ".".join(last_version[:-1] + last_version[-1] + 1) + else: + proksee_data["cgview"]["created"] = now.strftime("%Y-%m-%d %H:%M:%S") + proksee_data["cgview"]["version"] = "1.0" + + if "name" not in proksee_data["cgview"]: + proksee_data["cgview"]["name"] = "PPanGGOLiN annotations at genome levels" + proksee_data["cgview"]["id"] = uuid4().hex + + read_settings(proksee_data["cgview"]["settings"]) + + if "items" not in proksee_data["cgview"]["legend"]: + write_legend_items(proksee_data["cgview"]["legend"], features) + + if "tracks" not in proksee_data["cgview"]: + proksee_data["cgview"]["tracks"] = write_tracks(features) + return proksee_data + + +def write_contig(organism: Organism): + contigs_data_list = [] + for contig in tqdm(organism.contigs, unit="contig", disable=True): + + contigs_data_list.append({"name": contig.name, + "length": contig.length, + "orientation": "+", # "seq": "".join([gene.dna for gene in contig.genes]) + }) + return contigs_data_list + + +def write_genes(organism: Organism, sources: List[str]=None): + genes_data_list = [] + gf2gene = {} + + for gene in tqdm(organism.genes, total=organism.number_of_genes(), unit="genes", disable=True): + + gf = gene.family + if gf.name in gf2gene: + gf2gene[gf.name].append(gene) + else: + gf2gene[gf.name] = [gene] + # annotations = {source: "|".join(list(map(str, gf.get_source(source)))) for source in gf.sources if + # source in sources} + genes_data_list.append({"name": gene.name, + "type": "Gene", + "contig": gene.contig.name, + "start": gene.start, + "stop": gene.stop, + "strand": 1 if gene.strand == "+" else 
-1, + "product": gene.product, + "tags": [gene.family.named_partition, gene.family.name], + "source": "Gene", + "legend": gene.family.named_partition, + "meta": ""#annotations + }) + + for gene in tqdm(organism.rna_genes, total=organism.number_of_rnas(), unit="rnas", disable=True): + + if gf.name in gf2gene: + gf2gene[gf.name].append(gene) + else: + gf2gene[gf.name] = [gene] + # annotations = {source: "|".join(list(map(str, gf.get_source(source)))) for source in gf.sources if + # source in sources} + genes_data_list.append({"name": gene.name, + "type": "Gene", + "contig": gene.contig.name, + "start": gene.start, + "stop": gene.stop, + "strand": 1 if gene.strand == "+" else -1, + "product": gene.product, + "tags": [], + "source": "Gene", + "legend": "RNA", + "meta": ""#annotations + }) + + + return genes_data_list, gf2gene + + +def write_partition(organism: Organism): + partition_data_list = [] + c=0 + for gene in tqdm(organism.genes, total=organism.number_of_genes(), unit="genes", disable=True): + c += 1 + partition_data_list.append({"name": gene.family.name, + "presence": gene.family.named_partition, + "contig": gene.contig.name, + "start": gene.start, + "stop": gene.stop, + "source": "partition", + "legend": gene.family.named_partition, + "tags": ["partition"]}) + + + return partition_data_list + + +def write_rgp(pangenome: Pangenome, organism: Organism): + rgp_data_list = [] + for rgp in tqdm(pangenome.regions, unit="RGP", disable=True): + if rgp.organism == organism: + rgp_data_list.append({"name": rgp.name, + "contig": rgp.contig.name, + "start": rgp.start, + "stop": rgp.stop, + "legend": "RGP", + 'source':"RGP", + "tags": []}) + return rgp_data_list + + +def write_spots(pangenome: Pangenome, organism: Organism, gf2genes: Dict[str, List[Gene]]): + spots_data_list = [] + for spot in tqdm(pangenome.spots, unit="Spot", disable=True): + spot: Spot + spot_orgs = set() + + for gf in spot.families: + spot_orgs |= set(gf.organisms) + + if organism in spot_orgs: + 
gf_intersection = set(organism.families) & set(spot.families) + completion = round(len(gf_intersection) / spot.number_of_families, 2) + + for gf in gf_intersection: + for gene in gf2genes[gf.name]: + spots_data_list.append({"name": f"Spot_{spot.ID}", + "start": gene.start, + "stop": gene.stop, + "contig": gene.contig.name, + "legend": "Spot", + "source":"Spot", + "tags": [], + "meta": { + "completion": completion + }}) + return spots_data_list + + +def write_modules(pangenome: Pangenome, organism: Organism, gf2genes: Dict[str, List[Gene]]): + modules_data_list = [] + for module in tqdm(pangenome.modules, unit="Module", disable=True): + mod_orgs = set() + for gf in module.families: + mod_orgs |= set(gf.organisms) + if organism in mod_orgs: + gf_intersection = set(organism.families) & set(module.families) + completion = round(len(gf_intersection) / len(set(module.families)), 2) + for gf in gf_intersection: + for gene in gf2genes[gf.name]: + modules_data_list.append({"name": f"Module_{module.ID}", + "presence": "Module", + "start": gene.start, + "stop": gene.stop, + "contig": gene.contig.name, + "legend": "Module", + "source": "Module", + "tags": [], + "meta": { + "completion": completion + }}) + return modules_data_list + + +def write_proksee_organism(pangenome: Pangenome, organism: Organism, output: Path, template: Path, + features: List[str] = None, sources: List[str] = None): + + proksee_data = read_data(template=template, features=features, sources=sources) + + if "name" not in proksee_data["cgview"]["captions"]: + proksee_data["cgview"]["captions"][0]["name"] = f"{organism.name} annotated with PPanGGOLiN" + + proksee_data["cgview"]["sequence"]["contigs"] = write_contig(organism) + + if "features" not in proksee_data["cgview"]: + proksee_data["cgview"]["features"] = [] + + genes_features, gf2genes = write_genes(organism, sources=sources) + print(len(genes_features)) + proksee_data["cgview"]["features"] += genes_features + proksee_data["cgview"]["features"] += 
write_partition(organism) + + if "rgp" in features or "all" in features: + proksee_data["cgview"]["features"] += write_rgp(pangenome=pangenome, organism=organism) + if "spots" in features or "all" in features: + proksee_data["cgview"]["features"] += write_spots(pangenome=pangenome, organism=organism, gf2genes=gf2genes) + if "modules" in features or "all" in features: + proksee_data["cgview"]["features"] += write_modules(pangenome=pangenome, organism=organism, gf2genes=gf2genes) + + logging.debug(f"Write proksee for {organism.name}") + with open(output.joinpath(organism.name).with_suffix(".json"), "w") as out_json: + json.dump(proksee_data, out_json, indent=2) + + +def write_proksee(pangenome: Pangenome, output: Path, features: List[str] = None, sources: List[str] = None, + template: Path = None, organisms_list: List[str] = None, threads: int = 1, disable_bar: bool = False): + assert features is not None + if template is None: + template = Path(__file__).parent.joinpath("proksee_template").with_suffix(".json") + if organisms_list is not None: + organisms = [organism for organism in pangenome.organisms if organism.name in organisms_list] + else: + organisms = pangenome.organisms + with ThreadPoolExecutor(max_workers=threads) as executor: + with tqdm(total=len(organisms), unit='organism', disable=disable_bar) as progress: + futures = [] + for organism in organisms: + future = executor.submit(write_proksee_organism, pangenome, organism, output, + template, features, sources) + future.add_done_callback(lambda p: progress.update()) + futures.append(future) + + for future in futures: + future.result() diff --git a/ppanggolin/genome.py b/ppanggolin/genome.py index 0d3d8adb..69f9539e 100644 --- a/ppanggolin/genome.py +++ b/ppanggolin/genome.py @@ -655,12 +655,28 @@ def genes(self) -> Generator[Gene, None, None]: for contig in self.contigs: yield from contig.genes + @property + def rna_genes(self) -> Generator[RNA, None, None]: + """Generator to get genes in the organism + + 
:return: Generator of genes + """ + for contig in self.contigs: + yield from contig.RNAs + def number_of_genes(self) -> int: """ Get number of genes in the organism :return: Number of genes """ - return sum([contig.number_of_genes for contig in self.contigs]) + return sum((contig.number_of_genes for contig in self.contigs)) + + def number_of_rnas(self) -> int: + """ Get number of genes in the organism + + :return: Number of genes + """ + return sum((contig.number_of_rnas for contig in self.contigs)) @property def contigs(self) -> Generator[Contig, None, None]: From bffb2b862864f7e9c609f7f73551eea82a5d06fb Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Mon, 25 Sep 2023 17:36:41 +0200 Subject: [PATCH 133/173] improve proksee output --- ppanggolin/formats/writeFlat.py | 20 ++++++- ppanggolin/formats/write_proksee.py | 85 ++++++++++++++++++----------- ppanggolin/region.py | 12 ++++ 3 files changed, 82 insertions(+), 35 deletions(-) diff --git a/ppanggolin/formats/writeFlat.py b/ppanggolin/formats/writeFlat.py index 84ae194e..10d0389d 100644 --- a/ppanggolin/formats/writeFlat.py +++ b/ppanggolin/formats/writeFlat.py @@ -621,15 +621,29 @@ def write_projections(output: Path, compress: bool = False): def write_proksee(output: Path, compress: bool = False): """ """ - + + org_to_modules = defaultdict(set) + for mod in pan.modules: + for org in mod.organisms: + org_to_modules[org].add(mod) + + features = ["all"] template = Path(__file__).parent.joinpath("proksee_template").with_suffix(".json") organism_with_rgp = {rgp.organism for rgp in pan.regions} for organism in organism_with_rgp : #pan.organisms: - write_proksee_organism(pan, organism, output, - template, features) + + if organism.name in ["GCA_018141505.1_ASM1814150v1_genomic.fna", + "GCA_018219365.1_ASM1821936v1_genomic.fna", + "GCA_003031305.1_ASM303130v1_genomic.fna", + "GCA_000808515.1_ASM80851v1_genomic.fna", + "GCA_003932035.1_ASM393203v1_genomic.fna"]: + + write_proksee_organism(pan, organism, output, + 
template, features=features, + modules=org_to_modules[organism]) diff --git a/ppanggolin/formats/write_proksee.py b/ppanggolin/formats/write_proksee.py index 5474b320..d14e3f7b 100644 --- a/ppanggolin/formats/write_proksee.py +++ b/ppanggolin/formats/write_proksee.py @@ -9,22 +9,24 @@ from pathlib import Path from random import randint from tqdm import tqdm -from typing import Dict, List, Tuple +from typing import Dict, List, Tuple, Set from uuid import uuid4 - +from itertools import cycle # installed libraries from bokeh.palettes import Category20 +from plotly.express.colors import qualitative + from ppanggolin.genome import Organism, Contig, Gene -from ppanggolin.region import Spot +from ppanggolin.region import Spot, Module # local libraries from ppanggolin.pangenome import Pangenome def palette() -> List[Tuple[int]]: - palette = [] - for hex_color in list(Category20[20]): - palette.append(tuple(int(hex_color.strip('#')[i:i + 2], 16) for i in (0, 2, 4))) + palette = qualitative.Vivid + qualitative.Pastel2 + qualitative.Pastel1 + qualitative.Antique + qualitative.Safe + qualitative.Bold + palette = cycle(palette) + return palette @@ -36,31 +38,47 @@ def read_settings(settings_data: dict): settings_data["geneticCode"] = "11" -def write_legend_items(legend_data: dict, features: List[str]): #, sources: List[str]): +def write_legend_items(legend_data: dict, features: List[str], modules: Set[Module]): #, sources: List[str]): + + + colors = palette() - legend_data["items"] = [#{"name": "CDS", "swatchColor": f"rgba({','.join(map(str, colors.pop(1)))},0.5)", "decoration": "arrow"}, - {"name": "persistent", "swatchColor": "rgba(229,156,4,1)", "decoration": "arrow"}, - {"name": "shell", "swatchColor": "rgba(60,254,91,1)", "decoration": "arrow"}, - {"name": "cloud", "swatchColor": f"rgba({','.join(map(str, colors.pop(17)))},1)", "decoration": "arrow"}, - {"name": "RNA", "swatchColor": "rgba(137,23,207,0.5)", "decoration": "arrow"},] + print(colors) + # use 
https://medialab.github.io/iwanthue/ to find nice colors + # that associate well with established partition colors (orange, light green, light blue) + main_colors = { + "orange": "#e59c04", + "light green": "#00d860" , + "light blue": "#79deff", + "purple": "#a567bb", + "dark green": "#7a9a4c", + "dark red": "#ca5c55", + } + + + legend_data["items"] = [ + {"name": "persistent", "swatchColor": main_colors['orange'], "decoration": "arrow"}, + {"name": "shell", "swatchColor": main_colors['light green'], "decoration": "arrow"}, + {"name": "cloud", "swatchColor": main_colors['light blue'], "decoration": "arrow"}, + {"name": "RNA", "swatchColor": main_colors['purple'], "decoration": "arrow"},] if "rgp" in features or "all" in features: - legend_data["items"].append({"name": "RGP", "swatchColor": f"rgba({','.join(map(str, colors.pop(6)))}, 1)", "decoration": "arc"}), - if "spots" in features or "all" in features: - legend_data["items"].append({"name": "Spot", "swatchColor": f"rgba({','.join(map(str, colors.pop(5)))}, 1)", "decoration": "arc"}) + legend_data["items"].append({"name": "RGP", "swatchColor": main_colors['dark green'], "decoration": "arc"}), + # if "spots" in features or "all" in features: + # legend_data["items"].append({"name": "Spot", "swatchColor": main_colors['dark red'], 1)", "decoration": "arc"}) + if "modules" in features or "all" in features: - legend_data["items"].append({"name": "Module", "swatchColor": f"rgba({','.join(map(str, colors.pop(3)))},1)", "decoration": "arc"}) - # if "systems" in features or "all" in features: - # for source in sources: - # color = ','.join(map(str, colors.pop(randint(0, len(colors) - 1)))) - # legend_data["items"].append({"name": source, "decoration": "arc", "swatchColor": f"rgba({color},1)"}) + if modules is None: + legend_data["items"].append({"name": "Module", "swatchColor": main_colors['dark red'], "decoration": "arc"}) + else: + for mod in modules: + legend_data["items"].append({"name": f"module_{mod.ID}", 
"decoration": "arc", "swatchColor": next(colors)}) def write_tracks(features: List[str]): - tracks = [{"name": "Gene", "separateFeaturesBy": "None", "position": "outside", "thicknessRatio": 1, - "dataType": "feature", "dataMethod": "source", "dataKeys": "Gene"}, - {"name": "Partition", "separateFeaturesBy": "strand", "position": "outside", "thicknessRatio": 1, - "dataType": "feature", "dataMethod": "source", "dataKeys": "partition"}] + tracks = [{"name": "Gene", "separateFeaturesBy": "strand", "position": "outside", "thicknessRatio": 1, + "dataType": "feature", "dataMethod": "source", "dataKeys": "Gene"}, # {"name": "Partition", "separateFeaturesBy": "strand", "position": "outside", "thicknessRatio": 1, "dataType": "feature", "dataMethod": "source", "dataKeys": "partition"} + ] if "rgp" in features or "all" in features: tracks.append({"name": "RGP", "separateFeaturesBy": "None", "position": "inside", "thicknessRatio": 1, @@ -75,7 +93,7 @@ def write_tracks(features: List[str]): return tracks -def read_data(template: Path, features: List[str], sources: List[str] = None) -> dict: +def read_data(template: Path, features: List[str], modules: List[str] = None) -> dict: """ """ @@ -99,7 +117,7 @@ def read_data(template: Path, features: List[str], sources: List[str] = None) -> read_settings(proksee_data["cgview"]["settings"]) if "items" not in proksee_data["cgview"]["legend"]: - write_legend_items(proksee_data["cgview"]["legend"], features) + write_legend_items(proksee_data["cgview"]["legend"], features, modules) if "tracks" not in proksee_data["cgview"]: proksee_data["cgview"]["tracks"] = write_tracks(features) @@ -244,7 +262,7 @@ def write_modules(pangenome: Pangenome, organism: Organism, gf2genes: Dict[str, "start": gene.start, "stop": gene.stop, "contig": gene.contig.name, - "legend": "Module", + "legend": "Module",#f"module_{module.ID}", "source": "Module", "tags": [], "meta": { @@ -254,9 +272,11 @@ def write_modules(pangenome: Pangenome, organism: Organism, gf2genes: 
Dict[str, def write_proksee_organism(pangenome: Pangenome, organism: Organism, output: Path, template: Path, - features: List[str] = None, sources: List[str] = None): + features: List[str] = None, modules: List[str] = None): - proksee_data = read_data(template=template, features=features, sources=sources) + + print(len(modules), "MODULES") + proksee_data = read_data(template=template, features=features, modules=None) if "name" not in proksee_data["cgview"]["captions"]: proksee_data["cgview"]["captions"][0]["name"] = f"{organism.name} annotated with PPanGGOLiN" @@ -266,15 +286,16 @@ def write_proksee_organism(pangenome: Pangenome, organism: Organism, output: Pat if "features" not in proksee_data["cgview"]: proksee_data["cgview"]["features"] = [] - genes_features, gf2genes = write_genes(organism, sources=sources) + genes_features, gf2genes = write_genes(organism, sources=None) + print(len(genes_features)) proksee_data["cgview"]["features"] += genes_features proksee_data["cgview"]["features"] += write_partition(organism) if "rgp" in features or "all" in features: proksee_data["cgview"]["features"] += write_rgp(pangenome=pangenome, organism=organism) - if "spots" in features or "all" in features: - proksee_data["cgview"]["features"] += write_spots(pangenome=pangenome, organism=organism, gf2genes=gf2genes) + # if "spots" in features or "all" in features: + # proksee_data["cgview"]["features"] += write_spots(pangenome=pangenome, organism=organism, gf2genes=gf2genes) if "modules" in features or "all" in features: proksee_data["cgview"]["features"] += write_modules(pangenome=pangenome, organism=organism, gf2genes=gf2genes) diff --git a/ppanggolin/region.py b/ppanggolin/region.py index ad13e945..6a43447d 100644 --- a/ppanggolin/region.py +++ b/ppanggolin/region.py @@ -720,6 +720,18 @@ def families(self) -> Generator[GeneFamily, None, None]: :return: Families belonging to the module """ yield from self._families_getter.values() + + @property + def organisms(self) -> 
Generator[Organism, None, None]: + """Returns all the Organisms that have this module + + :return: Organisms that have this module + """ + organisms = set() + for fam in self.families: + organisms |= set(fam.organisms) + return organisms + def mk_bitarray(self, index: Dict[Organism, int], partition: str = 'all'): """Produces a bitarray representing the presence / absence of families in the organism using the provided index From ad920cf73cd0ecc2220f3b29e694a130f54badec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Wed, 27 Sep 2023 17:44:17 +0200 Subject: [PATCH 134/173] Fix write compress GEXF file --- VERSION | 2 +- ppanggolin/formats/writeFlat.py | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/VERSION b/VERSION index 1b38fdcf..70c79534 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.190 +1.2.191 diff --git a/ppanggolin/formats/writeFlat.py b/ppanggolin/formats/writeFlat.py index 25769e5b..aa3a13f0 100644 --- a/ppanggolin/formats/writeFlat.py +++ b/ppanggolin/formats/writeFlat.py @@ -263,10 +263,10 @@ def write_gexf_nodes(gexf: TextIO, light: bool = True, soft_core: False = 0.95): gexf.write(f' \n') if pan.number_of_spots > 0: str_spot = "|".join([str(s) for s in list(fam.spots)]) - gexf.write(f' \n') + gexf.write(f' \n') if pan.number_of_modules > 0: str_module = "|".join([str(m) for m in list(fam.modules)]) - gexf.write(f' \n') + gexf.write(f' \n') shift = 14 source_fields = {m.source: m.fields for f in pan.gene_families if len(list(f.metadata)) > 0 for m in f.metadata} for source_metadata_families in pan.metadata_sources("families"): @@ -336,7 +336,7 @@ def write_gexf(output: Path, light: bool = True, compress: bool = False): txt += "light gexf file for the pangenome graph..." if light else "gexf file for the pangenome graph..." 
logging.getLogger("PPanGGOLiN").info(txt) - outname = output / f"pangenomeGraph{'_light' if light else ''}.gexf" + outname = output / f"pangenomeGraph{'_light' if light else ''}.gexf{'.gz' if compress else ''}" with write_compressed_or_not(outname, compress) as gexf: graph_type = 'ligth gexf' if light else 'gexf' logging.getLogger("PPanGGOLiN").debug(f"Writing the {graph_type} header...") @@ -347,7 +347,7 @@ def write_gexf(output: Path, light: bool = True, compress: bool = False): write_gexf_edges(gexf, light) logging.getLogger("PPanGGOLiN").debug(f"Writing the {graph_type} ends...") write_gexf_end(gexf) - logging.getLogger("PPanGGOLiN").info(f"Done writing the gexf file : '{outname.as_posix()}'") + logging.getLogger("PPanGGOLiN").info(f"Done writing the gexf file : '{gexf.name}'") def write_matrix(output: Path, sep: str = ',', ext: str = 'csv', compress: bool = False, gene_names: bool = False): @@ -1012,9 +1012,9 @@ def write_flat_files(pangenome: Pangenome, output: Path, cpu: int = 1, soft_core if gene_pa: processes.append(p.apply_async(func=write_gene_presence_absence, args=(output, compress))) if gexf: - processes.append(p.apply_async(func=write_gexf, args=(output, False, soft_core))) + processes.append(p.apply_async(func=write_gexf, args=(output, False, compress))) if light_gexf: - processes.append(p.apply_async(func=write_gexf, args=(output, True, soft_core))) + processes.append(p.apply_async(func=write_gexf, args=(output, True, compress))) if projection: processes.append(p.apply_async(func=write_projections, args=(output, compress))) if stats: From 123dd484f3164e2982e13ab1ca36d0d047117c7f Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Fri, 29 Sep 2023 11:21:48 +0200 Subject: [PATCH 135/173] fix wrong computation introduce by unitest PR --- ppanggolin/formats/writeFlat.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/ppanggolin/formats/writeFlat.py b/ppanggolin/formats/writeFlat.py index aa3a13f0..fc30dc08 100644 --- 
a/ppanggolin/formats/writeFlat.py +++ b/ppanggolin/formats/writeFlat.py @@ -534,16 +534,17 @@ def write_stats(output: Path, soft_core: float = 0.95, dup_margin: float = 0.05, if gene.family in core: nb_gene_core += 1 completeness = "NA" + org_families = set(org.families) if len(single_copy_markers) > 0: - completeness = round((len(set(org.families) & single_copy_markers) / + completeness = round((len(org_families & single_copy_markers) / len(single_copy_markers)) * 100, 2) outfile.write("\t".join(map(str, [org.name, org.number_of_families(), nb_pers, nb_shell, nb_cloud, - len(core) + org.number_of_families(), - len(soft) + org.number_of_families(), + len(core & org_families), + len(soft & org_families), org.number_of_genes(), nb_gene_pers, nb_gene_shell, @@ -551,7 +552,7 @@ def write_stats(output: Path, soft_core: float = 0.95, dup_margin: float = 0.05, nb_gene_core, nb_gene_soft, completeness, - org.number_of_families() + len(single_copy_markers)])) + "\n") + len(org_families & single_copy_markers)])) + "\n") logging.getLogger("PPanGGOLiN").info("Done writing genome per genome statistics") @@ -674,7 +675,7 @@ def write_gene_families_tsv(output: Path, compress: bool = False): tsv.write("\t".join([fam.name, gene.ID if gene.local_identifier == "" else gene.local_identifier, "F" if gene.is_fragment else ""]) + "\n") logging.getLogger("PPanGGOLiN").info("Done writing the file providing the association between genes and " - f"gene families : '{outname}'") + f"gene families: '{outname}'") def write_regions(output: Path, compress: bool = False): From d3299a0c3e41396e4148ced6c7f895e160731c40 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Fri, 29 Sep 2023 16:55:53 +0200 Subject: [PATCH 136/173] add completeness in projection --- ppanggolin/projection/projection.py | 39 +++++++++++++++++++++++------ 1 file changed, 32 insertions(+), 7 deletions(-) diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index c2643c4b..097b14f2 100644 --- 
a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -172,7 +172,6 @@ def launch(args: argparse.Namespace): if predict_rgp: logging.getLogger('PPanGGOLiN').info('Detecting RGPs in input genomes.') - logging.getLogger('PPanGGOLiN').debug("Detecting multigenic families...") multigenics = pangenome.get_multigenics(pangenome_params.rgp.dup_margin) input_org_2_rgps = predict_RGP(pangenome, organisms, persistent_penalty=pangenome_params.rgp.persistent_penalty, variable_gain=pangenome_params.rgp.variable_gain, @@ -197,15 +196,27 @@ def launch(args: argparse.Namespace): input_orgs_to_modules = project_and_write_modules(pangenome, organisms, output_dir) organism_2_summary = {} + # dup margin value here is specified in argument and is used to compute completeness. + # Thats mean it can be different than dup margin used in spot and RGPS. + single_copy_fams = set() + + for fam in pangenome.gene_families: + if fam.named_partition == "persistent": + dup = len([genes for genes in fam.get_org_dict().values() if + len([gene for gene in genes if not gene.is_fragment]) > 1]) + + if (dup / fam.number_of_organisms) < args.dup_margin: + single_copy_fams.add(fam) + for organism in organisms: # summarize projection for all input organisms - organism_2_summary[organism] = summarize_projection(organism, pangenome, + organism_2_summary[organism] = summarize_projection(organism, pangenome, single_copy_fams, input_org_2_rgps.get(organism, None), input_org_to_spots.get(organism, None), input_orgs_to_modules.get(organism, None), - input_org_to_lonely_genes_count[organism], output_dir) + input_org_to_lonely_genes_count[organism]) - write_summaries(organism_2_summary, output_dir) + write_summaries(organism_2_summary, output_dir) def annotate_fasta_files(genome_name_to_fasta_path: Dict[str, dict], tmpdir: str, cpu: int = 1, translation_table: int = 11, @@ -434,8 +445,8 @@ def write_summaries(organism_2_summary: Dict[Organism, Dict[str, Any]], output_d 
df_summary.to_csv(output_dir / "summary_projection.tsv", sep='\t', index=False) -def summarize_projection(input_organism:Organism, pangenome:Pangenome, input_org_rgps:Region, - input_org_spots:Spot, input_org_modules:Module, singleton_gene_count:int, output_dir:Path): +def summarize_projection(input_organism:Organism, pangenome:Pangenome, single_copy_families:Set, input_org_rgps:Region, + input_org_spots:Spot, input_org_modules:Module, singleton_gene_count:int): """ :param singleton_gene_count: Number of genes that do not cluster with any of the gene families of the pangenome. @@ -452,6 +463,13 @@ def summarize_projection(input_organism:Organism, pangenome:Pangenome, input_org persistent_gene_count = len(partition_to_gene['persistent']) shell_gene_count = len(partition_to_gene['shell']) cloud_gene_count = len(partition_to_gene['cloud']) + + completeness = "NA" + + single_copy_markers_count = len(set(input_organism.families) & single_copy_families ) + if len(single_copy_families) > 0: + completeness = round((single_copy_markers_count / + len(single_copy_families)) * 100, 2) gene_count = persistent_gene_count + shell_gene_count + cloud_gene_count @@ -475,6 +493,8 @@ def summarize_projection(input_organism:Organism, pangenome:Pangenome, input_org "Persistent": {"genes":persistent_gene_count, "families":persistent_family_count}, "Shell": {"genes":shell_gene_count, "families":shell_family_count}, "Cloud": {"genes":cloud_gene_count, "families":cloud_family_count - singleton_gene_count, "specific families":singleton_gene_count}, + "Completeness":completeness, + "Single copy markers":single_copy_markers_count, "RGPs": rgp_count, "Spots": spot_count, "New spots": new_spot_count, @@ -1214,7 +1234,12 @@ def parser_projection(parser: argparse.ArgumentParser): optional.add_argument("--use_pseudo", required=False, action="store_true", help="In the context of provided annotation, use this option to read pseudogenes. 
" "(Default behavior is to ignore them)") - + + optional.add_argument("--dup_margin", required=False, type=restricted_float, default=0.05, + help="minimum ratio of organisms in which the family must have multiple genes " + "for it to be considered 'duplicated'. " + "This metric is used to compute completeness and duplication of the input genomes") + optional.add_argument("--spot_graph", required=False, action="store_true", help="Write the spot graph to a file, with pairs of blocks of single copy markers flanking RGPs " "as nodes. This graph can be used to visualize nodes that have RGPs from the input organism.") From 7bcc7a44d937101c784014b087378c26433f045c Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Mon, 2 Oct 2023 11:35:11 +0200 Subject: [PATCH 137/173] add possibility to add sequences --- ppanggolin/formats/writeFlat.py | 59 +++++++++++++++++++++-------- ppanggolin/formats/write_proksee.py | 55 +++++++++++---------------- 2 files changed, 66 insertions(+), 48 deletions(-) diff --git a/ppanggolin/formats/writeFlat.py b/ppanggolin/formats/writeFlat.py index 10d0389d..38cdd75e 100644 --- a/ppanggolin/formats/writeFlat.py +++ b/ppanggolin/formats/writeFlat.py @@ -19,9 +19,10 @@ from ppanggolin.genome import Organism, Gene, Contig, RNA from ppanggolin.region import Region, Spot from ppanggolin.pangenome import Pangenome -from ppanggolin.utils import write_compressed_or_not, mk_outdir, restricted_float +from ppanggolin.utils import write_compressed_or_not, mk_outdir, restricted_float, read_compressed_or_not from ppanggolin.formats.readBinaries import check_pangenome_info from ppanggolin.formats.write_proksee import write_proksee_organism +from ppanggolin.formats.writeSequences import read_genome_file # global variable to store the pangenome pan = Pangenome() # TODO change to pangenome:Pangenome = Pangenome=() ? 
needAnnotations = False @@ -618,9 +619,24 @@ def write_projections(output: Path, compress: bool = False): logging.getLogger("PPanGGOLiN").info("Done writing the projection files") -def write_proksee(output: Path, compress: bool = False): +def write_proksee(output: Path, compress: bool = False, fasta = None, anno = None): """ """ + # TODO improve this part by using fct created in projection to read such file + + organisms_file = fasta if fasta is not None else anno + + if organisms_file: + org_dict = {} + for line in read_compressed_or_not(organisms_file): + elements = [el.strip() for el in line.split("\t")] + if len(elements) <= 1: + raise Exception(f"No tabulation separator found in given --fasta or --anno file: '{organisms_file}'") + org_dict[elements[0]] = Path(elements[1]) + if not org_dict[elements[0]].exists(): # Check tsv sanity test if it's not one it's the other + org_dict[elements[0]] = organisms_file.parent.joinpath(org_dict[elements[0]]) + + org_to_modules = defaultdict(set) for mod in pan.modules: @@ -635,17 +651,17 @@ def write_proksee(output: Path, compress: bool = False): for organism in organism_with_rgp : #pan.organisms: - if organism.name in ["GCA_018141505.1_ASM1814150v1_genomic.fna", - "GCA_018219365.1_ASM1821936v1_genomic.fna", - "GCA_003031305.1_ASM303130v1_genomic.fna", - "GCA_000808515.1_ASM80851v1_genomic.fna", - "GCA_003932035.1_ASM393203v1_genomic.fna"]: + if organisms_file: + genome_sequences = read_genome_file(org_dict, organism.name) + else: + genome_sequences = None - write_proksee_organism(pan, organism, output, - template, features=features, - modules=org_to_modules[organism]) - - + write_proksee_organism(pan, organism, output, + template, features=features, + modules=org_to_modules[organism], + genome_sequences=genome_sequences) + + def write_gff(output: str, compress: bool = False): @@ -1093,7 +1109,7 @@ def write_flat_files(pangenome: Pangenome, output: Path, cpu: int = 1, soft_core light_gexf: bool = False, projection: bool = 
False, gff: bool = False, proksee: bool = False, stats: bool = False, json: bool = False, partitions: bool = False, regions: bool = False, families_tsv: bool = False, spots: bool = False, borders: bool = False, modules: bool = False, spot_modules: bool = False, compress: bool = False, - disable_bar: bool = False): + disable_bar: bool = False, fasta=None, anno=None): """ Main function to write flat files from pangenome @@ -1186,7 +1202,7 @@ def write_flat_files(pangenome: Pangenome, output: Path, cpu: int = 1, soft_core if gff: processes.append(p.apply_async(func=write_gff, args=(output, compress))) if proksee: - processes.append(p.apply_async(func=write_proksee, args=(output, compress))) + processes.append(p.apply_async(func=write_proksee, args=(output, compress, fasta, anno))) if stats: processes.append(p.apply_async(func=write_stats, args=(output, soft_core, dup_margin, compress))) if json: @@ -1226,7 +1242,7 @@ def launch(args: argparse.Namespace): gene_pa=args.Rtab, gexf=args.gexf, light_gexf=args.light_gexf, projection=args.projection, gff=args.gff, proksee=args.proksee, stats=args.stats, json=args.json, partitions=args.partitions, regions=args.regions, families_tsv=args.families_tsv, spots=args.spots, borders=args.borders, modules=args.modules, - spot_modules=args.spot_modules, compress=args.compress, disable_bar=args.disable_prog_bar) + spot_modules=args.spot_modules, compress=args.compress, disable_bar=args.disable_prog_bar, fasta=args.fasta, anno=args.anno) def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: @@ -1297,6 +1313,19 @@ def parser_flat(parser: argparse.ArgumentParser): optional.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus") + context = parser.add_argument_group(title="Contextually required arguments", + description="With --proksee and -gff, the following arguments can be " + "used to add sequence information to the output file:") + + context.add_argument('--fasta', 
required=False, type=Path, + help="A tab-separated file listing the organism names, and the fasta filepath of its genomic " + "sequence(s) (the fastas can be compressed with gzip). One line per organism.") + + context.add_argument('--anno', required=False, type=Path, + help="A tab-separated file listing the organism names, and the gff/gbff filepath of its " + "annotations (the files can be compressed with gzip). One line per organism. " + "If this is provided, those annotations will be used.") + if __name__ == '__main__': """To test local change and allow using debugger""" from ppanggolin.utils import set_verbosity_level, add_common_arguments diff --git a/ppanggolin/formats/write_proksee.py b/ppanggolin/formats/write_proksee.py index d14e3f7b..d87b43f1 100644 --- a/ppanggolin/formats/write_proksee.py +++ b/ppanggolin/formats/write_proksee.py @@ -43,7 +43,6 @@ def write_legend_items(legend_data: dict, features: List[str], modules: Set[Modu colors = palette() - print(colors) # use https://medialab.github.io/iwanthue/ to find nice colors # that associate well with established partition colors (orange, light green, light blue) main_colors = { @@ -124,14 +123,16 @@ def read_data(template: Path, features: List[str], modules: List[str] = None) -> return proksee_data -def write_contig(organism: Organism): +def write_contig(organism: Organism, genome_sequences): contigs_data_list = [] for contig in tqdm(organism.contigs, unit="contig", disable=True): - - contigs_data_list.append({"name": contig.name, - "length": contig.length, - "orientation": "+", # "seq": "".join([gene.dna for gene in contig.genes]) - }) + contig_info = {"name": contig.name, + "length": contig.length, + "orientation": "+", + } + if genome_sequences: + contig_info['seq'] = genome_sequences[contig.name] + contigs_data_list.append(contig_info) return contigs_data_list @@ -163,12 +164,6 @@ def write_genes(organism: Organism, sources: List[str]=None): for gene in tqdm(organism.rna_genes, 
total=organism.number_of_rnas(), unit="rnas", disable=True): - if gf.name in gf2gene: - gf2gene[gf.name].append(gene) - else: - gf2gene[gf.name] = [gene] - # annotations = {source: "|".join(list(map(str, gf.get_source(source)))) for source in gf.sources if - # source in sources} genes_data_list.append({"name": gene.name, "type": "Gene", "contig": gene.contig.name, @@ -249,46 +244,40 @@ def write_spots(pangenome: Pangenome, organism: Organism, gf2genes: Dict[str, Li def write_modules(pangenome: Pangenome, organism: Organism, gf2genes: Dict[str, List[Gene]]): modules_data_list = [] for module in tqdm(pangenome.modules, unit="Module", disable=True): - mod_orgs = set() - for gf in module.families: - mod_orgs |= set(gf.organisms) - if organism in mod_orgs: - gf_intersection = set(organism.families) & set(module.families) + gf_intersection = set(organism.families) & set(module.families) + if gf_intersection: completion = round(len(gf_intersection) / len(set(module.families)), 2) for gf in gf_intersection: for gene in gf2genes[gf.name]: modules_data_list.append({"name": f"Module_{module.ID}", - "presence": "Module", - "start": gene.start, - "stop": gene.stop, - "contig": gene.contig.name, - "legend": "Module",#f"module_{module.ID}", - "source": "Module", - "tags": [], - "meta": { - "completion": completion - }}) + "presence": "Module", + "start": gene.start, + "stop": gene.stop, + "contig": gene.contig.name, + "legend": "Module",#f"module_{module.ID}", + "source": "Module", + "tags": [], + "meta": { + "completion": completion + }}) return modules_data_list def write_proksee_organism(pangenome: Pangenome, organism: Organism, output: Path, template: Path, - features: List[str] = None, modules: List[str] = None): + features: List[str] = None, modules: List[str] = None, genome_sequences= None): - - print(len(modules), "MODULES") proksee_data = read_data(template=template, features=features, modules=None) if "name" not in proksee_data["cgview"]["captions"]: 
proksee_data["cgview"]["captions"][0]["name"] = f"{organism.name} annotated with PPanGGOLiN" - proksee_data["cgview"]["sequence"]["contigs"] = write_contig(organism) + proksee_data["cgview"]["sequence"]["contigs"] = write_contig(organism, genome_sequences) if "features" not in proksee_data["cgview"]: proksee_data["cgview"]["features"] = [] genes_features, gf2genes = write_genes(organism, sources=None) - print(len(genes_features)) proksee_data["cgview"]["features"] += genes_features proksee_data["cgview"]["features"] += write_partition(organism) From 4df48a6c40ce943ea94bb7d0ad765cb459bb15f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Tue, 3 Oct 2023 18:16:19 +0200 Subject: [PATCH 138/173] Replace Prodigal by Pyrodigal --- VERSION | 2 +- ppanggolin/annotate/synta.py | 79 +++++++++++++++++++---------------- ppanggolin/cluster/cluster.py | 2 + 3 files changed, 45 insertions(+), 38 deletions(-) diff --git a/VERSION b/VERSION index 0da01cf8..1b3db87e 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.187 +1.2.188 diff --git a/ppanggolin/annotate/synta.py b/ppanggolin/annotate/synta.py index c21dbdfd..fb649eb0 100644 --- a/ppanggolin/annotate/synta.py +++ b/ppanggolin/annotate/synta.py @@ -6,12 +6,16 @@ import os import tempfile from io import TextIOWrapper +from concurrent.futures import ThreadPoolExecutor from subprocess import Popen, PIPE import ast from collections import defaultdict from typing import Dict, List, Union from pathlib import Path +# install libraries +from pyrodigal import GeneFinder, TrainingInfo + # local libraries from ppanggolin.genome import Organism, Gene, RNA, Contig from ppanggolin.utils import is_compressed, read_compressed_or_not @@ -68,44 +72,45 @@ def launch_aragorn(fna_file: str, org: Organism) -> defaultdict: return gene_objs -def launch_prodigal(fna_file: str, org: Organism, code: int = 11, procedure: str = None) -> defaultdict: +def launch_prodigal(fna_file: TextIOWrapper, org: Organism, code: int = 11, 
use_meta: bool = False) -> defaultdict: """ Launches Prodigal to annotate CDS. Takes a fna file name and a locustag to give an ID to the found genes. - :param fna_file: file-like object containing the uncompressed fasta sequences + :param fna_file: File-like object containing the uncompressed fasta sequences :param org: Organism which will be annotated :param code: Translation table (genetic code) to use. - :param procedure: prodigal procedure used + :param use_meta: use meta procedure in Prodigal :return: Annotated genes in a list of gene objects """ - - locustag = org.name - cmd = list(map(str, ["prodigal", "-f", "sco", "-g", code, "-m", "-c", "-i", fna_file, "-p", procedure, "-q"])) - logging.getLogger("PPanGGOLiN").debug(f"prodigal command : {' '.join(cmd)}") - p = Popen(cmd, stdout=PIPE) - gene_objs = defaultdict(set) - c = 0 - header = "" - for line in p.communicate()[0].decode().split("\n"): - if line.startswith("# Sequence Data: "): - for data in line.split(";"): - if data.startswith("seqhdr"): - header = data.split("=")[1].replace('"', "").split()[0] - - elif line.startswith(">"): - c += 1 - line_data = line[1:].split("_") # not considering the '>' - gene = Gene(gene_id=locustag + "_CDS_" + str(c).zfill(4)) - gene.fill_annotations(start=int(line_data[1]), stop=int(line_data[2]), strand=line_data[3], gene_type="CDS", - genetic_code=code) - gene_objs[header].add(gene) - + sequences = {} + with open(fna_file.name, "r") as file: + for line in file.readlines(): + if line.startswith('>'): + contig_name = line.replace(">", "").replace("\n", "") + sequences[contig_name] = "" + else: + sequences[contig_name] += line.replace("\n", "") + gene_finder = GeneFinder( + meta=use_meta, # '-p meta' if meta is true else '-p single' + closed=True, # -c: Closed ends. Do not allow genes to run off edges. + mask=True # -m: Treat runs of N as masked sequence; don't build genes across them. 
+ ) + gene_finder.train(max(sequences.values(), key=len), force_nonsd=False, + translation_table=code) # -g: Specify a translation table to use (default 11). + gene_counter = 0 + for contig_name, sequence in sequences.items(): + for found in gene_finder.find_genes(sequence): + gene = Gene(gene_id=f"{org.name}_CDS_{str(gene_counter).zfill(4)}") + gene.fill_annotations(start=found.begin, stop=found.end, strand='-' if found.strand == -1 else '+', + gene_type="CDS", genetic_code=code) + gene_counter += 1 + gene_objs[contig_name].add(gene) return gene_objs -def launch_infernal(fna_file: str, org: Organism, tmpdir: str, kingdom: str = "bacteria") -> defaultdict: +def launch_infernal(fna_file: str, org: Organism, tmpdir: str, kingdom: str = "bacteria") -> defaultdict: """ Launches Infernal in hmmer-only mode to annotate rRNAs. @@ -214,7 +219,7 @@ def write_tmp_fasta(contigs: dict, tmpdir: str) -> tempfile._TemporaryFileWrappe def syntaxic_annotation(org: Organism, fasta_file: TextIOWrapper, tmpdir: str, norna: bool = False, - kingdom: str = "bacteria", code: int = 11, procedure: str = None) -> defaultdict: + kingdom: str = "bacteria", code: int = 11, use_meta: bool = False) -> defaultdict: """ Runs the different software for the syntaxic annotation. @@ -224,14 +229,14 @@ def syntaxic_annotation(org: Organism, fasta_file: TextIOWrapper, tmpdir: str, n :param norna: Use to avoid annotating RNA features. :param kingdom: Kingdom to which the prokaryota belongs to, to know which models to use for rRNA annotation. :param code: Translation table (genetic code) to use. 
- :param procedure: prodigal procedure used + :param use_meta: Use meta prodigal procedure :return: list of genes in the organism """ # launching tools for syntaxic annotation genes = defaultdict(list) - for key, items in launch_prodigal(fna_file=fasta_file.name, org=org, code=code, procedure=procedure).items(): + for key, items in launch_prodigal(fna_file=fasta_file, org=org, code=code, use_meta=use_meta).items(): genes[key].extend(items) if not norna: for key, items in launch_aragorn(fna_file=fasta_file.name, org=org).items(): @@ -277,8 +282,7 @@ def overlap_filter(all_genes: defaultdict, allow_overlap: bool = False) -> defau def get_dna_sequence(contig_seq: str, gene: Gene) -> str: - """ - Return the gene sequence + """Return the gene sequence :param contig_seq: Contig sequence :param gene: Gene @@ -317,13 +321,14 @@ def annotate_organism(org_name: str, file_name: Path, circular_contigs: List[str if is_compressed(file_name): # TODO simply copy file with shutil.copyfileobj fasta_file = write_tmp_fasta(contig_sequences, tmpdir) if procedure is None: # prodigal procedure is not force by user - all_contig_len = sum(len(contig) for contig in org.contigs) - logging.getLogger("PPanGGOLiN").debug(all_contig_len) - if all_contig_len < 20000: # case of short sequence - procedure = "meta" + max_contig_len = max(len(contig) for contig in org.contigs) + if max_contig_len < 20000: # case of short sequence + use_meta = True else: - procedure = "single" - genes = syntaxic_annotation(org, fasta_file, tmpdir, norna, kingdom, code, procedure) + use_meta = False + else: + use_meta = True if procedure == "meta" else False + genes = syntaxic_annotation(org, fasta_file, tmpdir, norna, kingdom, code, use_meta) genes = overlap_filter(genes, allow_overlap=allow_overlap) for contig_name, genes in genes.items(): diff --git a/ppanggolin/cluster/cluster.py b/ppanggolin/cluster/cluster.py index 77637015..80c40943 100644 --- a/ppanggolin/cluster/cluster.py +++ b/ppanggolin/cluster/cluster.py 
@@ -260,6 +260,8 @@ def read_gene2fam(pangenome: Pangenome, gene_to_fam: dict, disable_bar: bool = F link = True if pangenome.status["genomesAnnotated"] in ["Computed", "Loaded"] else False if link and len(gene_to_fam) != pangenome.number_of_genes: # then maybe there are genes with identical IDs + logging.getLogger("PPanGGOLiN").debug(f"gene_to_fam size: {len(gene_to_fam)}, " + f"Pangenome nb genes: {pangenome.number_of_genes}") raise Exception("Something unexpected happened during clustering (have less genes clustered than genes " "in the pangenome). A probable reason is that two genes in two different organisms have " "the same IDs; If you are sure that all of your genes have non identical IDs, please post an " From 9d252aea44889f43b31ca60a4bd111f3ea086b35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Wed, 4 Oct 2023 10:10:52 +0200 Subject: [PATCH 139/173] Change prodigal dependency for pyrodigal --- VERSION | 2 +- requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/VERSION b/VERSION index 1b3db87e..3cff2901 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.188 +1.2.189 diff --git a/requirements.txt b/requirements.txt index 2b72f9a1..45c799ce 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ tqdm>=4.64 pytables>=3.7 -prodigal>=2.6.3 +pyrodigal>=3.0.1 aragorn>=1.2.41 infernal>=1.1.4 mmseqs2>=13.45111 From e6fb61f56123480081b8d12eef18c70c03c18cd4 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 4 Oct 2023 11:24:59 +0200 Subject: [PATCH 140/173] update doc for new context parameters --- docs/user/Genomic-context.md | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/docs/user/Genomic-context.md b/docs/user/Genomic-context.md index a9f66ef1..e8c98d1a 100644 --- a/docs/user/Genomic-context.md +++ b/docs/user/Genomic-context.md @@ -43,8 +43,10 @@ In **sequence Id**, it is possible to find a NA value. 
This case, correspond to ## Detailed options | option name | Description | |-----------------------------|---------------------------------------------------------------------------| -| --no_defrag | Do not use the defragmentation step, to align sequences with MMseqs2 | -| --identity | Minimum identity percentage threshold | -| --coverage | Minimum coverage percentage threshold | -| -t, --transitive | Size of the transitive closure used to build the graph. This indicates the number of non-related genes allowed in-between two related genes. Increasing it will improve precision but lower sensitivity a little. | -| -s, --jaccard | Minimum jaccard similarity used to filter edges between gene families. Increasing it will improve precision but lower sensitivity a lot. | \ No newline at end of file +| --fast | Use representative sequences of gene families for input gene alignment. This option is recommended for faster processing but may be less sensitive. By default, all pangenome genes are used for alignment. This argument makes sense only when --sequence is provided. (default: False) | +| --no_defrag | Do not use the defragmentation step, to align sequences with MMseqs2 (default: False) | +| --identity | Minimum identity percentage threshold (default: 0.8)| +| --coverage | Minimum coverage percentage threshold (default: 0.8)| +| -t, --transitive | Size of the transitive closure used to build the graph. This indicates the number of non-related genes allowed in-between two related genes. Increasing it will improve precision but lower sensitivity a little. (default: 4) | +| -s, --jaccard | Minimum jaccard similarity used to filter edges between gene families. Increasing it will improve precision but lower sensitivity a lot. (default: 0.85) | +| -w, --window_size | Number of neighboring genes that are considered on each side of a gene of interest when searching for conserved genomic contexts. 
(default: 5) | \ No newline at end of file From 2d5ecf63c3bcf83e095a64e0947562f834eeed60 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 4 Oct 2023 14:41:33 +0200 Subject: [PATCH 141/173] add a first completeness metric in projection report --- ppanggolin/projection/projection.py | 31 ++++++++++++++++------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index 097b14f2..9eb01c57 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -10,7 +10,7 @@ import time from pathlib import Path import tempfile -from typing import Tuple, Set, Dict, Iterator, Optional, List, Iterable, Any +from typing import Tuple, Set, Dict, Optional, List, Iterable, Any from collections import defaultdict import csv from itertools import chain @@ -102,12 +102,26 @@ def launch(args: argparse.Namespace): check_pangenome_info(pangenome, need_annotations=True, need_families=True, disable_bar=args.disable_prog_bar, need_rgp=predict_rgp, need_modules=project_modules, need_gene_sequences=False, need_spots=project_spots) - - + + print("number_of_organisms", pangenome.number_of_organisms) logging.getLogger('PPanGGOLiN').info('Retrieving parameters from the provided pangenome file.') pangenome_params = argparse.Namespace( **{step: argparse.Namespace(**k_v) for step, k_v in pangenome.parameters.items()}) + # dup margin value here is specified in argument and is used to compute completeness. + # Thats mean it can be different than dup margin used in spot and RGPS. 
+ + # TODO make this single_copy_fams a method of class Pangenome that should be used in write --stats + single_copy_fams = set() + + for fam in pangenome.gene_families: + if fam.named_partition == "persistent": + dup = len([genes for genes in fam.get_org_dict().values() if + len([gene for gene in genes if not gene.is_fragment]) > 1]) + + if (dup / fam.number_of_organisms) < args.dup_margin: + single_copy_fams.add(fam) + genome_name_to_fasta_path, genome_name_to_annot_path = None, None @@ -196,17 +210,6 @@ def launch(args: argparse.Namespace): input_orgs_to_modules = project_and_write_modules(pangenome, organisms, output_dir) organism_2_summary = {} - # dup margin value here is specified in argument and is used to compute completeness. - # Thats mean it can be different than dup margin used in spot and RGPS. - single_copy_fams = set() - - for fam in pangenome.gene_families: - if fam.named_partition == "persistent": - dup = len([genes for genes in fam.get_org_dict().values() if - len([gene for gene in genes if not gene.is_fragment]) > 1]) - - if (dup / fam.number_of_organisms) < args.dup_margin: - single_copy_fams.add(fam) for organism in organisms: # summarize projection for all input organisms From 78bbaf5da631142693f75a360bbd3c06ca054318 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 4 Oct 2023 14:54:21 +0200 Subject: [PATCH 142/173] add completeness column in doc --- docs/user/Outputs.md | 2 +- docs/user/projection.md | 1 + ppanggolin/projection/projection.py | 1 - 3 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/user/Outputs.md b/docs/user/Outputs.md index 70a0bebb..12abb4e7 100644 --- a/docs/user/Outputs.md +++ b/docs/user/Outputs.md @@ -100,7 +100,7 @@ This file is made of 15 columns described in the following table | nb_cloud_genes | The number of genes whose family is cloud in that genome | | nb_exact_core_genes | The number of genes whose family is exact core in that genome | | nb_soft_core_genes | The number of genes whose family 
is soft core in that genome | -| completeness | This is an indicator of the proportion of single copy markers in the persistent that are present in the genome. While it is expected to be relatively close to 100 when working with isolates, it may be particularly interesting when working with very fragmented genomes as this provide a *de novo* estimation of the completess based on the expectation that single copy markers within the persistent should be mostly present in all individuals of the studied taxonomic group | +| completeness | This is an indicator of the proportion of single copy markers in the persistent that are present in the genome. While it is expected to be relatively close to 100 when working with isolates, it may be particularly interesting when working with very fragmented genomes as this provide a *de novo* estimation of the completeness based on the expectation that single copy markers within the persistent should be mostly present in all individuals of the studied taxonomic group | | nb_single_copy_markers | This indicates the number of present single copy markers in the genomes. They are computed using the parameter duplication_margin indicated at the beginning of the file. They correspond to all of the persistent gene families that are not present in more than one copy in 5% (or more) of the genomes by default. | It can be generated using the 'write' subcommand as such : diff --git a/docs/user/projection.md b/docs/user/projection.md index e8e35989..c4065a2f 100644 --- a/docs/user/projection.md +++ b/docs/user/projection.md @@ -35,6 +35,7 @@ The Output directory contains `summary_projection.tsv` giving an overview of the | Cloud genes | The number of genes in the "Cloud" partition.| | Cloud families | The number of gene families in the "Cloud" parition.| | Cloud specific families | The number of gene families that are specific to the input organism. 
These families are unique to the input organism and do not have homologs in any other genomes within the pangenome and have been assigned to the "Cloud" partition.| +| completeness | This indicates the proportion of single copy markers from the persistent partition that are present in the genome. While it is expected to be relatively close to 100 when working with isolates, it may be particularly interesting when working with very fragmented genomes as this provide a *de novo* estimation of the completeness based on the expectation that single copy markers within the persistent should be mostly present in all individuals of the studied taxonomic group. | | RGPs (Regions of Genomic Plasticity) | The number of Regions of Genomic Plasticity (RGPs) predicted within the input genome.| | Spots | The total number of spots of insertion associated with RGPs in the input genome.| | New spots | The number of new insertion spots that have been identified in the input genome. These spots represent novel genomic regions compared to other genomes in the pangenome.| diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index 9eb01c57..335f89ec 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -497,7 +497,6 @@ def summarize_projection(input_organism:Organism, pangenome:Pangenome, single_c "Shell": {"genes":shell_gene_count, "families":shell_family_count}, "Cloud": {"genes":cloud_gene_count, "families":cloud_family_count - singleton_gene_count, "specific families":singleton_gene_count}, "Completeness":completeness, - "Single copy markers":single_copy_markers_count, "RGPs": rgp_count, "Spots": spot_count, "New spots": new_spot_count, From 045e78afaab9006c8fc159657209c98e62cdd28b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Mon, 9 Oct 2023 18:19:30 +0200 Subject: [PATCH 143/173] SpeedUp by using Sequence class from pyrodigal --- VERSION | 2 +- ppanggolin/annotate/synta.py | 33 
+++++++++++++++++++++------------ 2 files changed, 22 insertions(+), 13 deletions(-) diff --git a/VERSION b/VERSION index 3cff2901..1b38fdcf 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.189 +1.2.190 diff --git a/ppanggolin/annotate/synta.py b/ppanggolin/annotate/synta.py index fb649eb0..3a774cc5 100644 --- a/ppanggolin/annotate/synta.py +++ b/ppanggolin/annotate/synta.py @@ -14,7 +14,7 @@ from pathlib import Path # install libraries -from pyrodigal import GeneFinder, TrainingInfo +from pyrodigal import GeneFinder, Sequence # local libraries from ppanggolin.genome import Organism, Gene, RNA, Contig @@ -74,7 +74,7 @@ def launch_aragorn(fna_file: str, org: Organism) -> defaultdict: def launch_prodigal(fna_file: TextIOWrapper, org: Organism, code: int = 11, use_meta: bool = False) -> defaultdict: """ - Launches Prodigal to annotate CDS. Takes a fna file name and a locustag to give an ID to the found genes. + Launches Prodigal to annotate CDS. Takes a fna file name and a locustag to give an ID to the pred genes. 
:param fna_file: File-like object containing the uncompressed fasta sequences :param org: Organism which will be annotated @@ -83,15 +83,24 @@ def launch_prodigal(fna_file: TextIOWrapper, org: Organism, code: int = 11, use_ :return: Annotated genes in a list of gene objects """ + def write_seq(fna_file: TextIOWrapper): + sequences = {} + contig_name = None + with open(fna_file.name, "r") as file: + lines = file.readlines() + for line in lines: + if line.startswith('>'): + contig_name = line.replace(">", "").replace("\n", "") + seq = "" + else: + while not line.startswith('>') and len(lines) > 0: + seq += line.replace("\n", "") + line = lines.pop(0) + sequences[contig_name] = Sequence(seq) + return sequences + gene_objs = defaultdict(set) - sequences = {} - with open(fna_file.name, "r") as file: - for line in file.readlines(): - if line.startswith('>'): - contig_name = line.replace(">", "").replace("\n", "") - sequences[contig_name] = "" - else: - sequences[contig_name] += line.replace("\n", "") + sequences = write_seq(fna_file) gene_finder = GeneFinder( meta=use_meta, # '-p meta' if meta is true else '-p single' closed=True, # -c: Closed ends. Do not allow genes to run off edges. @@ -101,9 +110,9 @@ def launch_prodigal(fna_file: TextIOWrapper, org: Organism, code: int = 11, use_ translation_table=code) # -g: Specify a translation table to use (default 11). 
gene_counter = 0 for contig_name, sequence in sequences.items(): - for found in gene_finder.find_genes(sequence): + for pred in gene_finder.find_genes(sequence): gene = Gene(gene_id=f"{org.name}_CDS_{str(gene_counter).zfill(4)}") - gene.fill_annotations(start=found.begin, stop=found.end, strand='-' if found.strand == -1 else '+', + gene.fill_annotations(start=pred.begin, stop=pred.end, strand='-' if pred.strand == -1 else '+', gene_type="CDS", genetic_code=code) gene_counter += 1 gene_objs[contig_name].add(gene) From 635c5e18adff126c779623a746f9f0f9a874b245 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Mon, 9 Oct 2023 18:33:10 +0200 Subject: [PATCH 144/173] Add documentation and clean code --- VERSION | 2 +- ppanggolin/annotate/synta.py | 11 ++++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/VERSION b/VERSION index 1b38fdcf..70c79534 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.190 +1.2.191 diff --git a/ppanggolin/annotate/synta.py b/ppanggolin/annotate/synta.py index 3a774cc5..10b20112 100644 --- a/ppanggolin/annotate/synta.py +++ b/ppanggolin/annotate/synta.py @@ -6,7 +6,6 @@ import os import tempfile from io import TextIOWrapper -from concurrent.futures import ThreadPoolExecutor from subprocess import Popen, PIPE import ast from collections import defaultdict @@ -83,7 +82,13 @@ def launch_prodigal(fna_file: TextIOWrapper, org: Organism, code: int = 11, use_ :return: Annotated genes in a list of gene objects """ - def write_seq(fna_file: TextIOWrapper): + def write_seq(fna_file: TextIOWrapper) -> Dict[str, Sequence]: + """Write contig sequence to predict genes with pyrodigal + + :param fna_file: Fasta file with sequences + + :return: Contig sequence link to contig name + """ sequences = {} contig_name = None with open(fna_file.name, "r") as file: @@ -108,7 +113,7 @@ def write_seq(fna_file: TextIOWrapper): ) gene_finder.train(max(sequences.values(), key=len), force_nonsd=False, translation_table=code) # 
-g: Specify a translation table to use (default 11). - gene_counter = 0 + gene_counter = 1 for contig_name, sequence in sequences.items(): for pred in gene_finder.find_genes(sequence): gene = Gene(gene_id=f"{org.name}_CDS_{str(gene_counter).zfill(4)}") From 721fac78c1bcdec00d90bfe22d7cd1f93ae5d239 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Tue, 10 Oct 2023 11:02:33 +0200 Subject: [PATCH 145/173] Fix bug in writing sequence for pyrodigal --- VERSION | 2 +- ppanggolin/annotate/synta.py | 45 +++++++++++------------------------- 2 files changed, 14 insertions(+), 33 deletions(-) diff --git a/VERSION b/VERSION index 70c79534..a11c24f7 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.191 +1.2.192 diff --git a/ppanggolin/annotate/synta.py b/ppanggolin/annotate/synta.py index 10b20112..1ff96001 100644 --- a/ppanggolin/annotate/synta.py +++ b/ppanggolin/annotate/synta.py @@ -64,52 +64,31 @@ def launch_aragorn(fna_file: str, org: Organism) -> defaultdict: line_data = line.split() start, stop = map(int, ast.literal_eval(line_data[2].replace("c", ""))) c += 1 - gene = RNA(rna_id=locustag + '_tRNA_' + str(c).zfill(3)) + gene = RNA(rna_id=locustag + '_tRNA_' + str(c).zfill(4)) gene.fill_annotations(start=start, stop=stop, strand="-" if line_data[2].startswith("c") else "+", gene_type="tRNA", product=line_data[1] + line_data[4]) gene_objs[header].add(gene) return gene_objs -def launch_prodigal(fna_file: TextIOWrapper, org: Organism, code: int = 11, use_meta: bool = False) -> defaultdict: +def launch_prodigal(contig_sequences: Dict[str, str], org: Organism, code: int = 11, use_meta: bool = False) -> defaultdict: """ Launches Prodigal to annotate CDS. Takes a fna file name and a locustag to give an ID to the pred genes. 
- :param fna_file: File-like object containing the uncompressed fasta sequences + :param contig_sequences: Dict containing contig sequences for pyrodigal :param org: Organism which will be annotated :param code: Translation table (genetic code) to use. :param use_meta: use meta procedure in Prodigal :return: Annotated genes in a list of gene objects """ - def write_seq(fna_file: TextIOWrapper) -> Dict[str, Sequence]: - """Write contig sequence to predict genes with pyrodigal - - :param fna_file: Fasta file with sequences - - :return: Contig sequence link to contig name - """ - sequences = {} - contig_name = None - with open(fna_file.name, "r") as file: - lines = file.readlines() - for line in lines: - if line.startswith('>'): - contig_name = line.replace(">", "").replace("\n", "") - seq = "" - else: - while not line.startswith('>') and len(lines) > 0: - seq += line.replace("\n", "") - line = lines.pop(0) - sequences[contig_name] = Sequence(seq) - return sequences - gene_objs = defaultdict(set) - sequences = write_seq(fna_file) + sequences = {contig_name: Sequence(sequence) for contig_name, sequence in contig_sequences.items()} gene_finder = GeneFinder( meta=use_meta, # '-p meta' if meta is true else '-p single' closed=True, # -c: Closed ends. Do not allow genes to run off edges. - mask=True # -m: Treat runs of N as masked sequence; don't build genes across them. + mask=True, # -m: Treat runs of N as masked sequence; don't build genes across them. + min_gene=120 # This is to prevent erreur with mmseqs translatenucs that cut too short sequences ) gene_finder.train(max(sequences.values(), key=len), force_nonsd=False, translation_table=code) # -g: Specify a translation table to use (default 11). 
@@ -164,7 +143,7 @@ def launch_infernal(fna_file: str, org: Organism, tmpdir: str, kingdom: str = "b line_data = line.split() strand = line_data[9] start, stop = map(int, (line_data[8], line_data[7]) if strand == "-" else (line_data[7], line_data[8])) - gene = RNA(rna_id=locustag + "_rRNA_" + str(c).zfill(3)) + gene = RNA(rna_id=locustag + "_rRNA_" + str(c).zfill(4)) gene.fill_annotations(start=start, stop=stop, strand=strand, gene_type="rRNA", product=" ".join(line_data[17:])) gene_objs[line_data[2]].add(gene) @@ -232,13 +211,15 @@ def write_tmp_fasta(contigs: dict, tmpdir: str) -> tempfile._TemporaryFileWrappe return tmp_file -def syntaxic_annotation(org: Organism, fasta_file: TextIOWrapper, tmpdir: str, norna: bool = False, - kingdom: str = "bacteria", code: int = 11, use_meta: bool = False) -> defaultdict: +def syntaxic_annotation(org: Organism, fasta_file: TextIOWrapper, contig_sequences: Dict[str, str], + tmpdir: str, norna: bool = False, kingdom: str = "bacteria", + code: int = 11, use_meta: bool = False) -> defaultdict: """ Runs the different software for the syntaxic annotation. :param org: Organism which will be annotated :param fasta_file: file-like object containing the uncompressed fasta sequences + :param contig_sequences: Dict containing contig sequences for pyrodigal :param tmpdir: Path to temporary directory :param norna: Use to avoid annotating RNA features. :param kingdom: Kingdom to which the prokaryota belongs to, to know which models to use for rRNA annotation. 
@@ -250,7 +231,7 @@ def syntaxic_annotation(org: Organism, fasta_file: TextIOWrapper, tmpdir: str, n # launching tools for syntaxic annotation genes = defaultdict(list) - for key, items in launch_prodigal(fna_file=fasta_file, org=org, code=code, use_meta=use_meta).items(): + for key, items in launch_prodigal(contig_sequences=contig_sequences, org=org, code=code, use_meta=use_meta).items(): genes[key].extend(items) if not norna: for key, items in launch_aragorn(fna_file=fasta_file.name, org=org).items(): @@ -342,7 +323,7 @@ def annotate_organism(org_name: str, file_name: Path, circular_contigs: List[str use_meta = False else: use_meta = True if procedure == "meta" else False - genes = syntaxic_annotation(org, fasta_file, tmpdir, norna, kingdom, code, use_meta) + genes = syntaxic_annotation(org, fasta_file, contig_sequences, tmpdir, norna, kingdom, code, use_meta) genes = overlap_filter(genes, allow_overlap=allow_overlap) for contig_name, genes in genes.items(): From 77c3be804a6035c61d39bbb25eee0e0c8161f4d1 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Thu, 12 Oct 2023 13:55:32 +0200 Subject: [PATCH 146/173] fix wrong completion calculation in module outputs --- ppanggolin/formats/writeFlat.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ppanggolin/formats/writeFlat.py b/ppanggolin/formats/writeFlat.py index 9dc352d1..30ba0b68 100644 --- a/ppanggolin/formats/writeFlat.py +++ b/ppanggolin/formats/writeFlat.py @@ -1029,8 +1029,8 @@ def write_org_modules(output: Path, compress: bool = False): for fam in mod.families: mod_orgs |= set(fam.organisms) for org in mod_orgs: - completion = round((org.number_of_families() + len(mod)) / len(mod), 2) - fout.write(f"module_{mod.ID}\t{org.name}\t{completion}\n") + completion = len(set(org.families) & set(mod.families)) / len(mod) + fout.write(f"module_{mod.ID}\t{org.name}\t{completion:.2}\n") fout.close() logging.getLogger("PPanGGOLiN").info( f"Done writing modules to organisms associations to: 
'{output.as_posix() + '/modules_in_organisms.tsv'}'") From 9b80ceb325e779dab94aad1d576b6165fbd3840c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Fri, 13 Oct 2023 10:11:28 +0200 Subject: [PATCH 147/173] Resolve requested change --- VERSION | 2 +- ppanggolin/align/alignOnPang.py | 171 +++++++------ ppanggolin/context/searchGeneContext.py | 317 ++++++++++++------------ ppanggolin/genome.py | 4 +- ppanggolin/region.py | 40 ++- 5 files changed, 289 insertions(+), 245 deletions(-) diff --git a/VERSION b/VERSION index 70c79534..a11c24f7 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.191 +1.2.192 diff --git a/ppanggolin/align/alignOnPang.py b/ppanggolin/align/alignOnPang.py index eed33323..28a6bc83 100644 --- a/ppanggolin/align/alignOnPang.py +++ b/ppanggolin/align/alignOnPang.py @@ -9,10 +9,9 @@ import subprocess import argparse from collections import defaultdict, Counter -from typing import List, Tuple, Set, Dict, IO, Iterator, Iterable +from typing import List, Tuple, Set, Dict, IO, Iterable from pathlib import Path - from tqdm import tqdm # local libraries @@ -29,7 +28,7 @@ def create_mmseqs_db(seq_files: Iterable[Path], tmpdir: Path, basename="sequence """ Create a MMseqs2 sequence database with the given fasta files. - :param seq_file: An iterable of path of FASTA files. + :param seq_files: An iterable of path of FASTA files. :param tmpdir: Path to the temporary directory where the database will be created. :param basename: Prefix for the database file (default: "sequences"). 
@@ -37,13 +36,14 @@ def create_mmseqs_db(seq_files: Iterable[Path], tmpdir: Path, basename="sequence """ with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir, delete=False, suffix=".DB", prefix=basename) as seqdb: - cmd = ["mmseqs", "createdb"] + [seq_file.as_posix() for seq_file in seq_files] + [seqdb.name, '--dbtype', '0'] - + cmd = ["mmseqs", "createdb"] + [seq_file.as_posix() for seq_file in seq_files] + [seqdb.name, '--dbtype', '0'] + logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) subprocess.run(cmd, stdout=subprocess.DEVNULL) return Path(seqdb.name) + def translate_with_mmseqs(seqdb: Path, translation_table: int, cpu: int, tmpdir: Path) -> Path: """ Translate nucleotide sequences in an MMseqs2 sequence database to amino acid sequences. @@ -56,21 +56,21 @@ def translate_with_mmseqs(seqdb: Path, translation_table: int, cpu: int, tmpdir: :return: Path to the new MMseqs2 sequence database containing translated amino acid sequences. """ - with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir, delete=False, prefix=seqdb.stem, suffix=".aa.DB") as seqdb_aa: + with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir, delete=False, prefix=seqdb.stem, + suffix=".aa.DB") as seqdb_aa: + cmd = ["mmseqs", "translatenucs", seqdb.as_posix(), seqdb_aa.name, "--translation-table", + f"{translation_table}", "--threads", str(cpu)] - cmd = ["mmseqs", "translatenucs", seqdb.as_posix(), seqdb_aa.name, "--translation-table", - f"{translation_table}", "--threads", str(cpu)] - logging.getLogger().debug(" ".join(cmd)) subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) - + return Path(seqdb_aa.name) -def align_seq_to_pang(target_seq_file:Path , query_seq_files: Iterable[Path], +def align_seq_to_pang(target_seq_file: Path, query_seq_files: Iterable[Path], tmpdir: Path, cpu: int = 1, no_defrag: bool = False, - identity: float = 0.8, coverage: float = 0.8, - is_query_nt:bool = False, is_target_nt:bool = False, translation_table: int = None) -> Path: + identity: float = 0.8, 
coverage: float = 0.8, + is_query_nt: bool = False, is_target_nt: bool = False, translation_table: int = None) -> Path: """ Align fasta sequence to pangenome sequences. @@ -92,36 +92,41 @@ def align_seq_to_pang(target_seq_file:Path , query_seq_files: Iterable[Path], query_db = create_mmseqs_db(query_seq_files, tmpdir, basename="query_sequences") if is_target_nt: - logging.getLogger().debug(f"Target sequences will be translated by mmseqs with translation table {translation_table}") + logging.getLogger().debug( + f"Target sequences will be translated by mmseqs with translation table {translation_table}") target_db = translate_with_mmseqs(target_db, translation_table, cpu, tmpdir) if is_query_nt: - logging.getLogger().debug(f"Query sequences will be translated by mmseqs with translation table {translation_table}") - query_db = translate_with_mmseqs(query_db, translation_table, cpu, tmpdir) + logging.getLogger().debug( + f"Query sequences will be translated by mmseqs with translation table {translation_table}") + query_db = translate_with_mmseqs(query_db, translation_table, cpu, tmpdir) cov_mode = "2" # coverage of query - if no_defrag: - cov_mode = "0" # coverage of query and target - + if no_defrag: + cov_mode = "0" # coverage of query and target + # mmseqs search command # see https://github.com/soedinglab/MMseqs2/issues/373 Using a combination of param to no miss short proteins - with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir.as_posix(), prefix="aln_result_db_file", suffix=".aln.DB", delete=False) as aln_db: - cmd = ["mmseqs", "search", query_db.as_posix(), target_db.as_posix(), aln_db.name, tmpdir.as_posix(), "-a", "--min-seq-id", str(identity), - "-c", str(coverage), "--cov-mode", cov_mode, "--threads", str(cpu), + with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir.as_posix(), prefix="aln_result_db_file", suffix=".aln.DB", + delete=False) as aln_db: + cmd = ["mmseqs", "search", query_db.as_posix(), target_db.as_posix(), aln_db.name, tmpdir.as_posix(), 
"-a", + "--min-seq-id", str(identity), + "-c", str(coverage), "--cov-mode", cov_mode, "--threads", str(cpu), "--seed-sub-mat", "VTML40.out", "-s", "2", '--comp-bias-corr', "0", "--mask", "0", "-e", "1"] - logging.getLogger().info("Aligning sequences") logging.getLogger().debug(" ".join(cmd)) start = time.time() subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) align_time = time.time() - start - logging.getLogger().info(f"Done aligning sequences in {round(align_time,2)} seconds") + logging.getLogger().info(f"Done aligning sequences in {round(align_time, 2)} seconds") - with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir, prefix="aln_result_db_file", suffix = ".tsv", delete=False) as outfile: - cmd = ["mmseqs", "convertalis", query_db.as_posix(), target_db.as_posix(), aln_db.name, outfile.name, "--format-mode", "2"] + with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir, prefix="aln_result_db_file", suffix=".tsv", + delete=False) as outfile: + cmd = ["mmseqs", "convertalis", query_db.as_posix(), target_db.as_posix(), aln_db.name, outfile.name, + "--format-mode", "2"] logging.getLogger().info("Extracting alignments...") logging.getLogger().debug(" ".join(cmd)) @@ -130,7 +135,8 @@ def align_seq_to_pang(target_seq_file:Path , query_seq_files: Iterable[Path], return Path(outfile.name) -def map_input_gene_to_family_all_aln(aln_res: Path, outdir:Path, pangenome: Pangenome) -> Tuple[Dict[str, GeneFamily], Path]: +def map_input_gene_to_family_all_aln(aln_res: Path, outdir: Path, + pangenome: Pangenome) -> Tuple[Dict[str, GeneFamily], Path]: """ Read alignment result to link input sequences to pangenome gene family. Alignment have been made against all genes of the pangenome. 
@@ -143,15 +149,15 @@ def map_input_gene_to_family_all_aln(aln_res: Path, outdir:Path, pangenome: Pang """ seq2pang = {} - aln_file_clean = outdir / f"alignment_input_seqs_to_all_pangenome_genes.tsv" # write the actual result file + aln_file_clean = outdir / "alignment_input_seqs_to_all_pangenome_genes.tsv" # write the actual result file logging.getLogger().debug(f'Writing alignment file in {aln_file_clean}') - + with open(aln_res, "r") as alnFile, open(aln_file_clean, "w") as aln_outfl: for line in alnFile: line_splitted = line.split() - + line_splitted[1] = line_splitted[1].replace("ppanggolin_", "") # remove the 'ppanggolin_' bit of the id - line_splitted[0] = line_splitted[0].replace("ppanggolin_", "") + line_splitted[0] = line_splitted[0].replace("ppanggolin_", "") input_seq_id, gene_id = line_splitted[0:2] @@ -164,7 +170,8 @@ def map_input_gene_to_family_all_aln(aln_res: Path, outdir:Path, pangenome: Pang return seq2pang, aln_file_clean -def map_input_gene_to_family_rep_aln(aln_res: Path, outdir:Path, pangenome: Pangenome) -> Tuple[Dict[str, GeneFamily], str]: +def map_input_gene_to_family_rep_aln(aln_res: Path, outdir: Path, + pangenome: Pangenome) -> Tuple[Dict[str, GeneFamily], str]: """ Read alignment result to link input sequences to pangenome gene family. Alignment have been made against representative sequence of gene families of the pangenome. 
@@ -176,14 +183,14 @@ def map_input_gene_to_family_rep_aln(aln_res: Path, outdir:Path, pangenome: Pang :return: Dictionnary with sequence link to pangenome gene families and actual path to the cleaned alignment file """ seq2pang = {} - aln_file_clean = outdir / f"alignment_input_seqs_to_pangenome_gene_families.tsv" # write the actual result file + aln_file_clean = outdir / "alignment_input_seqs_to_pangenome_gene_families.tsv" # write the actual result file logging.getLogger().debug(f'Writing alignment file in {aln_file_clean}') with open(aln_res, "r") as alnFile, open(aln_file_clean, "w") as aln_outfl: for line in alnFile: line_splitted = line.split() - + line_splitted[1] = line_splitted[1].replace("ppanggolin_", "") # remove the 'ppanggolin_' bit of the id line_splitted[0] = line_splitted[0].replace("ppanggolin_", "") @@ -198,7 +205,6 @@ def map_input_gene_to_family_rep_aln(aln_res: Path, outdir:Path, pangenome: Pang return seq2pang, aln_file_clean - def get_seq_ids(seq_file: TextIOWrapper) -> Tuple[Set[str], bool]: """ Get sequence IDs from a sequence input file in FASTA format and guess the sequence type based on the first sequences. 
@@ -211,7 +217,7 @@ def get_seq_ids(seq_file: TextIOWrapper) -> Tuple[Set[str], bool]: seq_set = set() seq_count = 0 first_seq_concat = "" - + for line in seq_file: if line.startswith(">"): seq_set.add(line[1:].split()[0].strip()) @@ -221,12 +227,11 @@ def get_seq_ids(seq_file: TextIOWrapper) -> Tuple[Set[str], bool]: char_counter = Counter(first_seq_concat) is_nucleotide = all(char in dna_expected_char for char in char_counter) - - return seq_set, is_nucleotide + return seq_set, is_nucleotide -def write_gene_fam_sequences(pangenome: Pangenome, file_obj: IO, add: str = "", disable_bar:bool=False): +def write_gene_fam_sequences(pangenome: Pangenome, file_obj: IO, add: str = "", disable_bar: bool = False): """ Export the sequence of gene families @@ -235,12 +240,14 @@ def write_gene_fam_sequences(pangenome: Pangenome, file_obj: IO, add: str = "", :param add: Add prefix to sequence name :param disable_bar: disable progress bar """ - for fam in tqdm(pangenome.gene_families, unit="families", disable=disable_bar, total=pangenome.number_of_gene_families): + for fam in tqdm(pangenome.gene_families, unit="families", disable=disable_bar, + total=pangenome.number_of_gene_families): file_obj.write(">" + add + fam.name + "\n") file_obj.write(fam.sequence + "\n") # file_obj.flush() -def write_all_gene_sequences(pangenome: Pangenome, file_obj: IO, add: str = "", disable_bar:bool = False): + +def write_all_gene_sequences(pangenome: Pangenome, file_obj: IO, add: str = "", disable_bar: bool = False): """ Export the sequence of pangenome genes @@ -257,6 +264,7 @@ def write_all_gene_sequences(pangenome: Pangenome, file_obj: IO, add: str = "", # this should never happen if the pangenome has been properly checked before launching this function. 
raise Exception("The pangenome does not include gene sequences") + def project_and_write_partition(seqid_to_gene_family: Dict[str, GeneFamily], seq_set: Set[str], output: Path) -> Path: """ Project the partition of each sequence from the input file and write them in a file @@ -272,10 +280,11 @@ def project_and_write_partition(seqid_to_gene_family: Dict[str, GeneFamily], seq with open(partition_proj, "w") as partProjFile: for input_seq, pangFam in seqid_to_gene_family.items(): partProjFile.write(input_seq + "\t" + pangFam.named_partition + "\n") - for remainingSeq in seq_set - seqid_to_gene_family.keys(): + for remainingSeq in seq_set - seqid_to_gene_family.keys(): partProjFile.write(remainingSeq + "\tcloud\n") # if there is no hit, it's going to be cloud genes. return partition_proj + def write_gene_to_gene_family(seqid_to_gene_family: Dict[str, GeneFamily], seq_set: Set[str], output: Path) -> Path: """ Write input gene to gene family. @@ -292,7 +301,7 @@ def write_gene_to_gene_family(seqid_to_gene_family: Dict[str, GeneFamily], seq_s for input_seq, pangFam in seqid_to_gene_family.items(): partProjFile.write(f"{input_seq}\t{pangFam.name}\n") - for remainingSeq in seq_set - seqid_to_gene_family.keys(): + for remainingSeq in seq_set - seqid_to_gene_family.keys(): partProjFile.write(f"{remainingSeq}\t{remainingSeq}\n") # if there is no hit, gene family is itself. 
return gene_fam_map_file @@ -420,10 +429,11 @@ def get_seq_info(seq_to_pang: dict, pangenome: Pangenome, output: Path, draw_rel logging.getLogger("PPanGGOLiN").info(f"File listing RGP and spots where sequences of interest are located : " f"{output / 'info_input_seq.tsv'}") -def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_files: Iterable[Path], output: Path, tmpdir: Path, is_input_seq_nt:bool, - cpu: int = 1, no_defrag: bool = False, identity: float = 0.8, - coverage: float = 0.8, translation_table: int = 11, disable_bar:bool = False) -> Tuple[Path, dict]: - + +def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_files: Iterable[Path], output: Path, + tmpdir: Path, is_input_seq_nt: bool, cpu: int = 1, no_defrag: bool = False, + identity: float = 0.8, coverage: float = 0.8, translation_table: int = 11, + disable_bar: bool = False) -> Tuple[Path, Dict[str, GeneFamily]]: """ Assign gene families from a pangenome to input sequences. @@ -447,25 +457,26 @@ def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_files: Itera """ # delete False to be able to keep tmp file. 
If they are not keep tmpdir will be destroyed so no need to delete tmpfile - with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir.as_posix(), delete=False, + with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir.as_posix(), delete=False, prefix="representative_genes", suffix=".faa") as tmp_pang_file: - logging.getLogger().debug(f'Write gene family sequences in {tmp_pang_file.name}') write_gene_fam_sequences(pangenome, tmp_pang_file, add="ppanggolin_", disable_bar=disable_bar) align_file = align_seq_to_pang(target_seq_file=Path(tmp_pang_file.name), query_seq_files=sequence_files, - tmpdir=tmpdir, cpu=cpu, - no_defrag=no_defrag, identity=identity, coverage=coverage, - is_query_nt=is_input_seq_nt, is_target_nt=False, - translation_table=translation_table) + tmpdir=tmpdir, cpu=cpu, + no_defrag=no_defrag, identity=identity, coverage=coverage, + is_query_nt=is_input_seq_nt, is_target_nt=False, + translation_table=translation_table) seq2pang, align_file = map_input_gene_to_family_rep_aln(align_file, output, pangenome) return align_file, seq2pang -def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_files: Iterable[Path], output: Path, tmpdir: Path, is_input_seq_nt:bool, - cpu: int = 1, no_defrag: bool = False, identity: float = 0.8, coverage: float = 0.8, - translation_table: int = 11, disable_bar:bool = False) -> Tuple[set, str, dict]: + +def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_files: Iterable[Path], output: Path, + tmpdir: Path, is_input_seq_nt: bool, cpu: int = 1, no_defrag: bool = False, + identity: float = 0.8, coverage: float = 0.8, translation_table: int = 11, + disable_bar: bool = False) -> Tuple[Path, Dict[str, GeneFamily]]: """ Assign gene families from a pangenome to input sequences. @@ -488,27 +499,26 @@ def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_files: Itera and a dictionary mapping input sequences to gene families. 
""" - - with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir.as_posix(), delete=False, + with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir.as_posix(), delete=False, prefix="all_pangenome_genes", suffix=".fna") as tmp_pang_file: - logging.getLogger().debug(f'Write all pangenome gene sequences in {tmp_pang_file.name}') write_all_gene_sequences(pangenome, tmp_pang_file, add="ppanggolin_", disable_bar=disable_bar) align_file = align_seq_to_pang(target_seq_file=Path(tmp_pang_file.name), query_seq_files=sequence_files, - tmpdir=tmpdir, cpu=cpu, - no_defrag=no_defrag, identity=identity, coverage=coverage, - is_query_nt=is_input_seq_nt, is_target_nt=True, - translation_table=translation_table ) + tmpdir=tmpdir, cpu=cpu, + no_defrag=no_defrag, identity=identity, coverage=coverage, + is_query_nt=is_input_seq_nt, is_target_nt=True, + translation_table=translation_table) seq2pang, align_file = map_input_gene_to_family_all_aln(align_file, output, pangenome) return align_file, seq2pang + def align(pangenome: Pangenome, sequence_file: Path, output: Path, identity: float = 0.8, coverage: float = 0.8, no_defrag: bool = False, cpu: int = 1, getinfo: bool = False, use_representatives: bool = False, - draw_related: bool = False, translation_table:int=11, tmpdir: Path = None, + draw_related: bool = False, translation_table: int = 11, tmpdir: Path = None, disable_bar: bool = False, keep_tmp=False): """ Aligns pangenome sequences with sequences in a FASTA file using MMSeqs2. @@ -529,7 +539,6 @@ def align(pangenome: Pangenome, sequence_file: Path, output: Path, identity: flo :param keep_tmp: If True, keep temporary files. 
""" - tmpdir = Path(tempfile.gettempdir()) if tmpdir is None else tmpdir if pangenome.status["geneFamilySequences"] not in ["inFile", "Loaded", "Computed"]: raise Exception("Cannot use this function as your pangenome does not have gene families representatives " @@ -546,22 +555,27 @@ def align(pangenome: Pangenome, sequence_file: Path, output: Path, identity: flo need_spots=True, need_modules=need_mod, disable_bar=disable_bar) else: check_pangenome_info(pangenome, need_families=True, disable_bar=disable_bar) - + with read_compressed_or_not(sequence_file) as seqFileObj: seq_set, is_nucleotide = get_seq_ids(seqFileObj) with create_tmpdir(main_dir=tmpdir, basename="align_input_seq_tmpdir", keep_tmp=keep_tmp) as new_tmpdir: - if use_representatives: - align_file, seq2pang = get_input_seq_to_family_with_rep(pangenome, [sequence_file], output, new_tmpdir, is_input_seq_nt=is_nucleotide, - cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, - translation_table=translation_table, disable_bar=disable_bar) + align_file, seq2pang = get_input_seq_to_family_with_rep(pangenome, [sequence_file], output, new_tmpdir, + is_input_seq_nt=is_nucleotide, + cpu=cpu, no_defrag=no_defrag, identity=identity, + coverage=coverage, + translation_table=translation_table, + disable_bar=disable_bar) else: align_file, seq2pang = get_input_seq_to_family_with_all(pangenome=pangenome, sequence_files=[sequence_file], - output=output, tmpdir=new_tmpdir, is_input_seq_nt=is_nucleotide, - cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, - translation_table=translation_table, disable_bar=disable_bar) + output=output, tmpdir=new_tmpdir, + is_input_seq_nt=is_nucleotide, + cpu=cpu, no_defrag=no_defrag, identity=identity, + coverage=coverage, + translation_table=translation_table, + disable_bar=disable_bar) if getinfo or draw_related: # TODO Add getinfo to function and remove if get_seq_info(seq2pang, pangenome, output, draw_related, disable_bar=disable_bar) @@ -585,7 +599,7 
@@ def launch(args: argparse.Namespace): tmpdir=args.tmpdir, identity=args.identity, coverage=args.coverage, no_defrag=args.no_defrag, cpu=args.cpu, getinfo=args.getinfo, use_representatives=args.fast, draw_related=args.draw_related, - translation_table=args.translation_table, + translation_table=args.translation_table, disable_bar=args.disable_prog_bar, keep_tmp=args.keep_tmp) @@ -626,8 +640,8 @@ def parser_align(parser: argparse.ArgumentParser): optional.add_argument('--coverage', required=False, type=float, default=0.8, help="min coverage percentage threshold") optional.add_argument("--fast", required=False, action="store_true", - help="Use representative sequences of gene families for input gene alignment. " - "This option is faster but may be less sensitive. By default, all pangenome genes are used.") + help="Use representative sequences of gene families for input gene alignment. " + "This option is faster but may be less sensitive. By default, all pangenome genes are used.") optional.add_argument("--translation_table", required=False, default="11", help="Translation table (genetic code) to use.") optional.add_argument("--getinfo", required=False, action="store_true", @@ -644,11 +658,12 @@ def parser_align(parser: argparse.ArgumentParser): optional.add_argument("--tmpdir", required=False, type=str, default=Path(tempfile.gettempdir()), help="directory for storing temporary files") optional.add_argument("--keep_tmp", required=False, default=False, action="store_true", - help="Keeping temporary files (useful for debugging).") + help="Keeping temporary files (useful for debugging).") + if __name__ == '__main__': """To test local change and allow using debugger""" - from ppanggolin.utils import check_log, set_verbosity_level, add_common_arguments + from ppanggolin.utils import set_verbosity_level, add_common_arguments main_parser = argparse.ArgumentParser( description="Depicting microbial species diversity via a Partitioned PanGenome Graph Of Linked Neighbors", 
diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index 6d8188d7..1ca725c7 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -21,10 +21,10 @@ # local libraries from ppanggolin.formats import check_pangenome_info from ppanggolin.genome import Gene, Contig -from ppanggolin.utils import mk_outdir, restricted_float, add_gene, connected_components, create_tmpdir, read_compressed_or_not -from ppanggolin.geneFamily import GeneFamily +from ppanggolin.utils import mk_outdir, restricted_float, create_tmpdir, read_compressed_or_not from ppanggolin.pangenome import Pangenome -from ppanggolin.align.alignOnPang import project_and_write_partition, get_input_seq_to_family_with_rep, get_input_seq_to_family_with_all, get_seq_ids +from ppanggolin.align.alignOnPang import project_and_write_partition, get_input_seq_to_family_with_rep, \ + get_input_seq_to_family_with_all, get_seq_ids from ppanggolin.region import GeneContext from ppanggolin.geneFamily import GeneFamily from ppanggolin.projection.projection import write_gene_to_gene_family @@ -32,9 +32,11 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: Path, tmpdir: Path, sequence_file: Path = None, families: Path = None, transitive: int = 4, identity: float = 0.5, - coverage: float = 0.8, use_representatives: bool = False, jaccard_threshold: float = 0.85, + coverage: float = 0.8, use_representatives: bool = False, + jaccard_threshold: float = 0.85, window_size: int = 1, no_defrag: bool = False, - cpu: int = 1, graph_format:str = "graphml", disable_bar=True, translation_table:int=11, keep_tmp:bool = False): + cpu: int = 1, graph_format: str = "graphml", disable_bar=True, + translation_table: int = 11, keep_tmp: bool = False): """ Main function to search common gene contexts between sequence set and pangenome families @@ -53,6 +55,7 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: Path, tmpdir: 
:param cpu: Number of core used to process :param graph_format: Write format of the context graph. Can be graphml or gexf :param disable_bar: Allow preventing bar progress print + :param translation_table: The translation table to use when the input sequences are nucleotide sequences. :param keep_tmp: If True, keep temporary files. """ # check statuses and load info @@ -63,77 +66,82 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: Path, tmpdir: check_pangenome_info(pangenome, need_annotations=True, need_families=True, disable_bar=disable_bar) families_of_interest = set() - fam_2_seq = None - + fam2seq = {} if sequence_file is not None: # Alignment of sequences on pangenome families with read_compressed_or_not(sequence_file) as seqFileObj: seq_set, is_nucleotide = get_seq_ids(seqFileObj) - logging.debug(f"Input sequences are {'nucletide' if is_nucleotide else 'protein'} sequences") + logging.debug(f"Input sequences are {'nucleotide' if is_nucleotide else 'protein'} sequences") with create_tmpdir(main_dir=tmpdir, basename="align_input_seq_tmpdir", keep_tmp=keep_tmp) as new_tmpdir: - + if use_representatives: - _, seqid_to_gene_family = get_input_seq_to_family_with_rep(pangenome, [sequence_file], output, new_tmpdir, is_input_seq_nt=is_nucleotide, - cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, - translation_table=translation_table, disable_bar=disable_bar) + _, seqid2fam = get_input_seq_to_family_with_rep(pangenome, [sequence_file], output, + new_tmpdir, is_input_seq_nt=is_nucleotide, + cpu=cpu, no_defrag=no_defrag, + identity=identity, coverage=coverage, + translation_table=translation_table, + disable_bar=disable_bar) else: - _, seqid_to_gene_family = get_input_seq_to_family_with_all(pangenome=pangenome, sequence_files=[sequence_file], - output=output, tmpdir=new_tmpdir, is_input_seq_nt=is_nucleotide, - cpu=cpu, no_defrag=no_defrag, - identity=identity, coverage=coverage, - translation_table=translation_table, 
disable_bar=disable_bar) - - project_and_write_partition(seqid_to_gene_family, seq_set, output) - write_gene_to_gene_family(seqid_to_gene_family, seq_set, output) - - for pan_family in seqid_to_gene_family.values(): + _, seqid2fam = get_input_seq_to_family_with_all(pangenome=pangenome, + sequence_files=[sequence_file], + output=output, tmpdir=new_tmpdir, + is_input_seq_nt=is_nucleotide, + cpu=cpu, no_defrag=no_defrag, + identity=identity, coverage=coverage, + translation_table=translation_table, + disable_bar=disable_bar) + + project_and_write_partition(seqid2fam, seq_set, output) + write_gene_to_gene_family(seqid2fam, seq_set, output) + fam2seq = {gf.ID: seqid for seqid, gf in seqid2fam} + for pan_family in seqid2fam.values(): families_of_interest.add(pan_family) - if families is not None: with read_compressed_or_not(families) as f: for fam_name in f.read().splitlines(): families_of_interest.add(pangenome.get_gene_family(fam_name)) - # Compute the graph with transitive closure size provided as parameter start_time = time.time() logging.getLogger().info("Building the graph...") - - gene_context_graph = compute_gene_context_graph(families=families_of_interest, transitive=transitive, + + gene_context_graph = compute_gene_context_graph(families=families_of_interest, transitive=transitive, window_size=window_size, disable_bar=disable_bar) - + logging.getLogger().info( f"Took {round(time.time() - start_time, 2)} seconds to build the graph to find common gene contexts") - - logging.getLogger().debug(f"Context graph made of {nx.number_of_nodes(gene_context_graph)} nodes and {nx.number_of_edges(gene_context_graph)} edges") + + logging.getLogger().debug( + f"Context graph made of {nx.number_of_nodes(gene_context_graph)} nodes and {nx.number_of_edges(gene_context_graph)} edges") compute_edge_metrics(gene_context_graph, jaccard_threshold) # Filter graph filter_flag = f'is_jaccard_gene_>_{jaccard_threshold}' - - edges_to_remove = [(n,v) for n,v,d in 
gene_context_graph.edges(data=True) if not d[filter_flag]] + + edges_to_remove = [(n, v) for n, v, d in gene_context_graph.edges(data=True) if not d[filter_flag]] gene_context_graph.remove_edges_from(edges_to_remove) logging.getLogger().debug(f"Filtering context graph on {filter_flag}") - logging.getLogger().debug(f"Context graph made of {nx.number_of_nodes(gene_context_graph)} nodes and {nx.number_of_edges(gene_context_graph)} edges") + logging.getLogger().debug( + f"Context graph made of {nx.number_of_nodes(gene_context_graph)} nodes and {nx.number_of_edges(gene_context_graph)} edges") gene_contexts = get_gene_contexts(gene_context_graph, families_of_interest) - gene_context_graph = make_graph_writable(gene_context_graph) out_graph_file = write_graph(gene_context_graph, output, graph_format) if len(gene_contexts) != 0: - logging.getLogger().info(f"There are {sum((len(gc) for gc in gene_contexts))} families among {len(gene_contexts)} gene contexts") - - output_file = os.path.join(output, "gene_contexts.tsv") + logging.getLogger().info( + f"There are {sum((len(gc) for gc in gene_contexts))} families among {len(gene_contexts)} gene contexts") - export_context_to_dataframe(gene_contexts, fam_2_seq, output_file) + output_file = output / "gene_contexts.tsv" + + export_context_to_dataframe(gene_contexts, fam2seq, output_file) else: logging.getLogger("PPanGGOLiN").info("No gene contexts were found") @@ -164,25 +172,26 @@ def get_gene_contexts(context_graph: nx.Graph, families_of_interest: Set[GeneFam # Connected component graph Filtering # remove singleton famillies - connected_components = (component for component in connected_components if len(component) > 1) + connected_components = (component for component in connected_components if len(component) > 1) # remove component made only of famillies not initially requested connected_components = (component for component in connected_components if component & families_of_interest) gene_contexts = set() families_in_context = 
set() - + for i, component in enumerate(connected_components): families_in_context |= component family_of_interest_of_gc = component & families_of_interest gene_context = GeneContext(gc_id=i, families=component, families_of_interest=family_of_interest_of_gc) - + # add gc id to node attribute - node_attributes = {n:{"gene_context_id":i, "families_of_interest": n in families_of_interest} for n in component} + node_attributes = {n: {"gene_context_id": i, "families_of_interest": n in families_of_interest} for n in + component} nx.set_node_attributes(context_graph, node_attributes) gene_contexts.add(gene_context) - + node_not_in_context = set(context_graph.nodes()) - families_in_context context_graph.remove_nodes_from(node_not_in_context) @@ -190,67 +199,64 @@ def get_gene_contexts(context_graph: nx.Graph, families_of_interest: Set[GeneFam def make_graph_writable(context_graph): - """ + The original context graph contains ppanggolin objects as nodes and lists and dictionaries in edge attributes. + Since these objects cannot be written to the output graph, + this function creates a new graph that contains only writable objects. - The original context graph contains - ppanggolin objects as nodes and lists and dictionaries in edge attributes. Since these objects - cannot be written to the output graph, this function creates a new graph that contains only - writable objects. - - :param gene_contexts: List of gene context. it includes graph of the context - + :param context_graph: List of gene context. it includes graph of the context """ - - def filter_attribute(data:dict): + def filter_attribute(data: dict): """ Helper function to filter the edge attributes. :param data: The edge attribute data. :return: A filtered dictionary containing only non-collection attributes. 
""" - return {k:v for k, v in data.items() if type(v) not in [set, dict, list]} + return {k: v for k, v in data.items() if type(v) not in [set, dict, list]} G = nx.Graph() - G.add_edges_from((f1.name, f2.name, filter_attribute(d)) for f1, f2, d in context_graph.edges(data=True)) - + G.add_edges_from((f1.name, f2.name, filter_attribute(d)) for f1, f2, d in context_graph.edges(data=True)) # convert transitivity dict to str - edges_with_transitivity_str = {(f1.name, f2.name):str(d['transitivity']) for f1, f2, d in context_graph.edges(data=True)} + edges_with_transitivity_str = {(f1.name, f2.name): str(d['transitivity']) for f1, f2, d in + context_graph.edges(data=True)} nx.set_edge_attributes(G, edges_with_transitivity_str, name="transitivity") - nodes_attributes_filtered = {f.name:filter_attribute(d) for f,d in context_graph.nodes(data=True)} + nodes_attributes_filtered = {f.name: filter_attribute(d) for f, d in context_graph.nodes(data=True)} # on top of attributes already contained in node of context graph # add organisms and genes count that have the family, the partition and if the family was in initially requested - nodes_family_data = {f.name:{"organisms": f.number_of_organisms, - "partition": f.named_partition, - "genes": f.number_of_genes} for f in context_graph.nodes()} - + nodes_family_data = {f.name: {"organisms": f.number_of_organisms, + "partition": f.named_partition, + "genes": f.number_of_genes} for f in context_graph.nodes()} + for f, d in G.nodes(data=True): d.update(nodes_family_data[f]) d.update(nodes_attributes_filtered[f]) return G -def write_graph(G:nx.Graph, output_dir: str, graph_format:str): + +def write_graph(G: nx.Graph, output_dir: Path, graph_format: str): """ Write a graph to file in the GraphML format or/and in GEXF format. + :param G: Graph to write :param output_dir: The output directory where the graph file will be written. :param graph_format: Formats of the output graph. 
Can be graphml or gexf """ if "graphml" == graph_format: - out_file = os.path.join(output_dir, "graph_context.graphml") + out_file = output_dir / "graph_context.graphml" logging.info(f'Writting context graph in {out_file}') nx.write_graphml_lxml(G, out_file) elif "gexf" == graph_format: - out_file = os.path.join(output_dir, "graph_context.gexf") + out_file = output_dir / "graph_context.gexf" logging.info(f'Writting context graph in {out_file}') nx.readwrite.gexf.write_gexf(G, out_file) else: @@ -258,6 +264,7 @@ def write_graph(G:nx.Graph, output_dir: str, graph_format:str): return out_file + def compute_edge_metrics(context_graph: nx.Graph, gene_proportion_cutoff: float) -> None: """ Compute various metrics on the edges of the context graph. @@ -267,25 +274,27 @@ def compute_edge_metrics(context_graph: nx.Graph, gene_proportion_cutoff: float) """ # compute jaccard on organism and on genes for f1, f2, data in context_graph.edges(data=True): - - data['jaccard_organism'] = len(data['organisms'])/len(set(f1.organisms) | set(f2.organisms)) - - f1_gene_proportion = len(data['genes'][f1])/f1.number_of_genes - f2_gene_proportion = len(data['genes'][f2])/f2.number_of_genes - - data[f'f1'] = f1.name - data[f'f2'] = f2.name - data[f'f1_jaccard_gene'] = f1_gene_proportion - data[f'f2_jaccard_gene'] = f2_gene_proportion - - data[f'is_jaccard_gene_>_{gene_proportion_cutoff}'] = (f1_gene_proportion >= gene_proportion_cutoff) and (f2_gene_proportion >= gene_proportion_cutoff) + data['jaccard_organism'] = len(data['organisms']) / len(set(f1.organisms) | set(f2.organisms)) + + f1_gene_proportion = len(data['genes'][f1]) / f1.number_of_genes + f2_gene_proportion = len(data['genes'][f2]) / f2.number_of_genes + + data['f1'] = f1.name + data['f2'] = f2.name + data['f1_jaccard_gene'] = f1_gene_proportion + data['f2_jaccard_gene'] = f2_gene_proportion + + data[f'is_jaccard_gene_>_{gene_proportion_cutoff}'] = (f1_gene_proportion >= gene_proportion_cutoff) and ( + f2_gene_proportion >= 
gene_proportion_cutoff) transitivity_counter = data['transitivity'] - mean_transitivity = sum((transitivity*counter for transitivity, counter in transitivity_counter.items()))/sum((counter for counter in transitivity_counter.values())) + mean_transitivity = sum( + (transitivity * counter for transitivity, counter in transitivity_counter.items())) / sum( + (counter for counter in transitivity_counter.values())) data['mean_transitivity'] = mean_transitivity - + # the following commented out lines are additional metrics that could be used # data['min_jaccard_organism'] = len(data['organisms'])/min(len(f1.organisms), len(f2.organisms)) @@ -297,7 +306,7 @@ def compute_edge_metrics(context_graph: nx.Graph, gene_proportion_cutoff: float) def add_edges_to_context_graph(context_graph: nx.Graph, - contig_genes: Iterable[Gene], + contig_genes: List[Gene], contig_windows: List[Tuple[int, int]], transitivity: int, is_circular: bool): @@ -314,7 +323,7 @@ def add_edges_to_context_graph(context_graph: nx.Graph, for window_start, window_end in contig_windows: for gene_index in range(window_start, window_end + 1): gene = contig_genes[gene_index] - next_genes = get_n_next_genes_index(gene_index, next_genes_count=transitivity+1, + next_genes = get_n_next_genes_index(gene_index, next_genes_count=transitivity + 1, contig_size=len(contig_genes), is_circular=is_circular) next_genes = list(next_genes) @@ -324,26 +333,25 @@ def add_edges_to_context_graph(context_graph: nx.Graph, # next_gene_index is not in any range of genes in the context # so it is ignored along with all following genes break - + next_gene = contig_genes[next_gene_index] if next_gene.family == gene.family: # If the next gene has the same family, the two genes refer to the same node # so they are ignored continue - + context_graph.add_edge(gene.family, next_gene.family) edge_dict = context_graph[gene.family][next_gene.family] if i == 0: edge_dict['adjacent_family'] = True - + # Store information of the transitivity used to 
link the two genes: if "transitivity" not in edge_dict: - edge_dict['transitivity'] = {i:0 for i in range(transitivity +1)} + edge_dict['transitivity'] = {i: 0 for i in range(transitivity + 1)} edge_dict['transitivity'][i] += 1 - # Add node attributes node_gene_dict = context_graph.nodes[gene.family] next_gene_gene_dict = context_graph.nodes[next_gene.family] @@ -354,22 +362,21 @@ def add_edges_to_context_graph(context_graph: nx.Graph, add_val_to_dict_attribute(node_gene_dict, "genes", gene) add_val_to_dict_attribute(next_gene_gene_dict, "genes", next_gene) - # Add edge attributes edge_dict = context_graph[gene.family][next_gene.family] try: genes_edge_dict = edge_dict['genes'] - except: + except Exception: genes_edge_dict = {} edge_dict['genes'] = genes_edge_dict - + add_val_to_dict_attribute(genes_edge_dict, gene.family, gene) add_val_to_dict_attribute(genes_edge_dict, next_gene.family, next_gene) add_val_to_dict_attribute(edge_dict, "organisms", gene.organism) increment_attribute_counter(edge_dict, "gene_pairs") - + assert gene.organism == next_gene.organism, f"Gene of the same contig have a different organism. {gene.organism} and {next_gene.organism}" @@ -389,7 +396,7 @@ def add_val_to_dict_attribute(attr_dict: dict, attribute_key, attribute_value): attr_dict[attribute_key] = {attribute_value} -def increment_attribute_counter(edge_dict: dict, key:Hashable): +def increment_attribute_counter(edge_dict: dict, key: Hashable): """ Increment the counter for an edge/node attribute in the edge/node dictionary. @@ -404,7 +411,8 @@ def increment_attribute_counter(edge_dict: dict, key:Hashable): edge_dict[key] = 1 -def get_n_next_genes_index(current_index: int, next_genes_count: int, contig_size: int, is_circular: bool = False) -> Iterator[int]: +def get_n_next_genes_index(current_index: int, next_genes_count: int, + contig_size: int, is_circular: bool = False) -> Iterator[int]: """ Generate the indices of the next genes based on the current index and contig properties. 
@@ -424,17 +432,18 @@ def get_n_next_genes_index(current_index: int, next_genes_count: int, contig_siz raise IndexError(f'current gene index is out of range. ' f"Contig has {contig_size} genes while the given gene index is {current_index}") if is_circular: - next_genes = chain(range(current_index+1, contig_size), range(0, current_index)) + next_genes = chain(range(current_index + 1, contig_size), range(0, current_index)) else: - next_genes = range(current_index+1, contig_size) - + next_genes = range(current_index + 1, contig_size) + for i, next_gene_index in enumerate(next_genes): if i == next_genes_count: break yield next_gene_index - -def extract_contig_window(contig_size: int, positions_of_interest: Iterable[int], window_size: int, is_circular:bool = False): + +def extract_contig_window(contig_size: int, positions_of_interest: Iterable[int], window_size: int, + is_circular: bool = False): """ Extracts contiguous windows around positions of interest within a contig. @@ -450,7 +459,7 @@ def extract_contig_window(contig_size: int, positions_of_interest: Iterable[int] sorted_positions = sorted(positions_of_interest) # Check if any position of interest is out of range - if sorted_positions[0] <0 or sorted_positions[-1] >= contig_size: + if sorted_positions[0] < 0 or sorted_positions[-1] >= contig_size: raise IndexError(f'Positions of interest are out of range. ' f"Contig has {contig_size} genes while given min={sorted_positions[0]} & max={sorted_positions[-1]} positions") @@ -462,33 +471,34 @@ def extract_contig_window(contig_size: int, positions_of_interest: Iterable[int] # the returned window are always checked that its positions are not out of range... 
# so there's no chance to find an out of scope position in final list if first_position - window_size < 0: - out_of_scope_position = (contig_size ) + first_position + out_of_scope_position = contig_size + first_position sorted_positions.append(out_of_scope_position) - - if last_position + window_size >= contig_size : + + if last_position + window_size >= contig_size: out_of_scope_position = last_position - contig_size sorted_positions.insert(0, out_of_scope_position) - + start_po = max(sorted_positions[0] - window_size, 0) - + for position, next_po in zip_longest(sorted_positions, sorted_positions[1:]): - + if next_po is None: # If there are no more positions, add the final window - end_po = min(position + window_size, contig_size-1) + end_po = min(position + window_size, contig_size - 1) windows_coordinates.append((start_po, end_po)) - - elif position + window_size +1 < next_po - window_size: + + elif position + window_size + 1 < next_po - window_size: # If there is a gap between positions, add the current window # and update the start position for the next window - end_po = min(position + window_size, contig_size-1) - + end_po = min(position + window_size, contig_size - 1) + windows_coordinates.append((start_po, end_po)) - + start_po = max(next_po - window_size, 0) - + return windows_coordinates + def get_contig_to_genes(gene_families: Iterable[GeneFamily]) -> Dict[Contig, Set[Gene]]: """ Group genes from specified gene families by contig. @@ -497,7 +507,7 @@ def get_contig_to_genes(gene_families: Iterable[GeneFamily]) -> Dict[Contig, Set :return: A dictionary mapping contigs to sets of genes. 
""" - + contig_to_genes_of_interest = defaultdict(set) for gene_family in gene_families: for gene in gene_family.genes: @@ -506,7 +516,8 @@ def get_contig_to_genes(gene_families: Iterable[GeneFamily]) -> Dict[Contig, Set return contig_to_genes_of_interest -def compute_gene_context_graph(families: Iterable[GeneFamily], transitive: int = 4, window_size: int = 0, disable_bar: bool = False) -> nx.Graph: +def compute_gene_context_graph(families: Iterable[GeneFamily], transitive: int = 4, window_size: int = 0, + disable_bar: bool = False) -> nx.Graph: """ Construct the graph of gene contexts between families of the pangenome. @@ -519,28 +530,27 @@ def compute_gene_context_graph(families: Iterable[GeneFamily], transitive: int = """ context_graph = nx.Graph() - + contig_to_genes_of_interest = get_contig_to_genes(families) - - for contig, genes_of_interest in tqdm(contig_to_genes_of_interest.items(), unit="contig", total=len(contig_to_genes_of_interest), disable=disable_bar): - + + for contig, genes_of_interest in tqdm(contig_to_genes_of_interest.items(), unit="contig", + total=len(contig_to_genes_of_interest), disable=disable_bar): genes_count = contig.number_of_genes - + genes_of_interest_positions = [g.position for g in genes_of_interest] - contig_windows = extract_contig_window(genes_count, genes_of_interest_positions, - window_size=window_size, is_circular=contig.is_circular) - + contig_windows = extract_contig_window(genes_count, genes_of_interest_positions, + window_size=window_size, is_circular=contig.is_circular) + add_edges_to_context_graph(context_graph, - contig.get_genes(), - contig_windows, - transitive, - contig.is_circular) + contig.get_genes(), + contig_windows, + transitive, + contig.is_circular) return context_graph - -def fam2seq(seq_to_pan: dict) -> dict: +def fam_to_seq(seq_to_pan: dict) -> dict: """ Create a dictionary with gene families as keys and list of sequences id as values @@ -559,12 +569,12 @@ def fam2seq(seq_to_pan: dict) -> dict: return 
fam_2_seq -def export_context_to_dataframe(gene_contexts: set, fam_to_seq: dict, output: str): +def export_context_to_dataframe(gene_contexts: set, fam2seq: Dict[str, int], output: Path): """ Export the results into dataFrame - :param gene_contexts: connected components found in the pan - :param fam_to_seq: Dictionary with gene families as keys and list of sequence ids as values + :param gene_contexts: connected components found in the pangenome + :param fam2seq: Dictionary with gene families ID as keys and list of sequence ids as values :param output: output path """ @@ -572,20 +582,20 @@ def export_context_to_dataframe(gene_contexts: set, fam_to_seq: dict, output: st for gene_context in gene_contexts: for family in gene_context.families: - if fam_to_seq is None or fam_to_seq.get(family.ID) is None: + if fam2seq.get(family.ID) is None: sequence_id = None else: - sequence_id = ','.join(fam_to_seq.get(family.ID)) + sequence_id = ','.join(fam2seq.get(family.ID)) - family_info = {"GeneContext ID":gene_context.ID, + family_info = {"GeneContext ID": gene_context.ID, "Gene family name": family.name, - "Sequence ID":sequence_id, - "Nb Genomes":family.number_of_organisms, - "Partition": family.named_partition } + "Sequence ID": sequence_id, + "Nb Genomes": family.number_of_organisms, + "Partition": family.named_partition} lines.append(family_info) - + df = pd.DataFrame(lines).set_index("GeneContext ID") - + df = df.sort_values(["GeneContext ID", "Sequence ID"], na_position='last') df.to_csv(output, sep="\t", na_rep='NA') @@ -602,7 +612,7 @@ def launch(args: argparse.Namespace): if not any([args.sequences, args.family]): raise Exception("At least one of --sequences or --family option must be given") - + mk_outdir(args.output, args.force) pangenome = Pangenome() @@ -615,12 +625,13 @@ def launch(args: argparse.Namespace): check_pangenome_info(pangenome, need_annotations=True, need_families=True, disable_bar=args.disable_prog_bar) - 
search_gene_context_in_pangenome(pangenome=pangenome, output=args.output, tmpdir=args.tmpdir, sequence_file=args.sequences, families=args.family, transitive=args.transitive, - identity=args.identity, coverage=args.coverage, use_representatives=args.fast, jaccard_threshold=args.jaccard, + identity=args.identity, coverage=args.coverage, use_representatives=args.fast, + jaccard_threshold=args.jaccard, window_size=args.window_size, - no_defrag=args.no_defrag, cpu=args.cpu, disable_bar=args.disable_prog_bar, graph_format=args.graph_format, + no_defrag=args.no_defrag, cpu=args.cpu, disable_bar=args.disable_prog_bar, + graph_format=args.graph_format, translation_table=args.translation_table, keep_tmp=args.keep_tmp) @@ -648,8 +659,9 @@ def parser_context(parser: argparse.ArgumentParser): required = parser.add_argument_group(title="Required arguments", description="All of the following arguments are required :") required.add_argument('-p', '--pangenome', required=False, type=Path, help="The pangenome.h5 file") - required.add_argument('-o', '--output', required=False, type=Path, default="ppanggolin_context" + time.strftime("_DATE%Y-%m-%d_HOUR%H.%M.%S", - time.localtime()) + "_PID" + str(os.getpid()), + required.add_argument('-o', '--output', required=False, type=Path, + default="ppanggolin_context" + time.strftime("_DATE%Y-%m-%d_HOUR%H.%M.%S", + time.localtime()) + "_PID" + str(os.getpid()), help="Output directory where the file(s) will be written") onereq = parser.add_argument_group(title="Input file", description="One of the following argument is required :") onereq.add_argument('-S', '--sequences', required=False, type=Path, @@ -662,10 +674,10 @@ def parser_context(parser: argparse.ArgumentParser): help="DO NOT Realign gene families to link fragments with" "their non-fragmented gene family.") optional.add_argument("--fast", required=False, action="store_true", - help="Use representative sequences of gene families for input gene alignment. 
" - "This option is recommended for faster processing but may be less sensitive. " - "By default, all pangenome genes are used for alignment. " - "This argument makes sense only when --sequence is provided.") + help="Use representative sequences of gene families for input gene alignment. " + "This option is recommended for faster processing but may be less sensitive. " + "By default, all pangenome genes are used for alignment. " + "This argument makes sense only when --sequence is provided.") optional.add_argument('--identity', required=False, type=float, default=0.8, help="min identity percentage threshold") optional.add_argument('--coverage', required=False, type=float, default=0.8, @@ -677,18 +689,19 @@ def parser_context(parser: argparse.ArgumentParser): "non related genes allowed in-between two related genes. Increasing it will improve " "precision but lower sensitivity a little.") optional.add_argument("-w", "--window_size", required=False, type=int, default=5, - help="Number of neighboring genes that are considered on each side of " - "a gene of interest when searching for conserved genomic contexts.") - + help="Number of neighboring genes that are considered on each side of " + "a gene of interest when searching for conserved genomic contexts.") + optional.add_argument("-s", "--jaccard", required=False, type=restricted_float, default=0.85, help="minimum jaccard similarity used to filter edges between gene families. Increasing it " "will improve precision but lower sensitivity a lot.") - optional.add_argument('--graph_format', help="Format of the context graph. Can be gexf or graphml.", default='graphml', choices=['gexf','graphml']) + optional.add_argument('--graph_format', help="Format of the context graph. 
Can be gexf or graphml.", + default='graphml', choices=['gexf', 'graphml']) optional.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus") optional.add_argument("--tmpdir", required=False, type=str, default=Path(tempfile.gettempdir()), help="directory for storing temporary files") optional.add_argument("--keep_tmp", required=False, default=False, action="store_true", - help="Keeping temporary files (useful for debugging).") + help="Keeping temporary files (useful for debugging).") if __name__ == '__main__': diff --git a/ppanggolin/genome.py b/ppanggolin/genome.py index 2c8cde9c..ef2e13b0 100644 --- a/ppanggolin/genome.py +++ b/ppanggolin/genome.py @@ -438,7 +438,9 @@ def remove(self, position): del self[position] def get_genes(self, begin: int = 0, end: int = None) -> List[Gene]: - """Gets a list of genes within a range. If argument is given it return all genes. + """ + Gets a list of genes within a range. + If no arguments are given it return all genes. :param begin: Position of the first gene to retrieve :param end: Position of the last gene to not retrieve diff --git a/ppanggolin/region.py b/ppanggolin/region.py index 695ae608..6365e408 100644 --- a/ppanggolin/region.py +++ b/ppanggolin/region.py @@ -737,17 +737,24 @@ def mk_bitarray(self, index: Dict[Organism, int], partition: str = 'all'): class GeneContext: """ - A class used to represent a gene context which is a collection of gene families related to a specific genomic context.. + Represent a gene context which is a collection of gene families related to a specific genomic context.. - :param gc_id: Identifier of the gene context. - :param families: Gene families related to the gene context. - :param families_of_interest: Input families for which the context is being searched. + Methods + - families: Generator that yields all the gene families in the gene context. + - add_context_graph: Add a context graph corresponding to the gene context. 
+ - add_family: Add a gene family to the gene context. + + Fields + - gc_id: The identifier of the gene context. + - graph: context graph corresponding to the gene context """ def __init__(self, gc_id: int, families: Set[GeneFamily] = None, families_of_interest: Set[GeneFamily] = None): """Constructor method - :param gc_id : Identifier of the Gene context - :param families: Gene families related to the GeneContext + + :param gc_id: Identifier of the gene context. + :param families: Gene families related to the gene context. + :param families_of_interest: Input families for which the context is being searched. """ if not isinstance(gc_id, int): @@ -756,7 +763,7 @@ def __init__(self, gc_id: int, families: Set[GeneFamily] = None, families_of_int self.ID = gc_id self._families_getter = {} self.families_of_interest = families_of_interest - self.graph = None + self._graph = None if families is not None: if not all(isinstance(fam, GeneFamily) for fam in families): raise Exception("You provided elements that were not GeneFamily objects. " @@ -846,15 +853,23 @@ def __delitem__(self, name): except KeyError: raise KeyError(f"There isn't gene family with the name {name} in the gene context") - def add_context_graph(self, graph: nx.Graph): + @property + def graph(self): + if self._graph is None: + raise ValueError("Graph has not been added to the context") + return self._graph + + @graph.setter + def graph(self, graph: nx.Graph): """ Add a context graph to the gene context. :param graph: The context graph. 
""" - self.graph = graph - - + if not isinstance(nx.Graph, graph): + logging.getLogger("PPanGGOLiN").debug(f"given type: {type(graph)}") + raise TypeError("Context graph must be a networkx graph object.") + self._graph = graph @property def families(self) -> Generator[GeneFamily, None, None]: @@ -864,7 +879,6 @@ def families(self) -> Generator[GeneFamily, None, None]: """ yield from self._families_getter.values() - def add_family(self, family: GeneFamily): """ Add a gene family to the gene context. @@ -874,4 +888,4 @@ def add_family(self, family: GeneFamily): if not isinstance(family, GeneFamily): raise Exception("You did not provide a GeneFamily object. " "GeneContexts are only made of GeneFamily objects.") - self.families.add(family) \ No newline at end of file + self[family.name] = family From 49d3cdf40ef6074b893fc44c475d1a44bb706331 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Fri, 13 Oct 2023 11:17:55 +0200 Subject: [PATCH 148/173] Fix bug in writting dict --- VERSION | 2 +- ppanggolin/context/searchGeneContext.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/VERSION b/VERSION index a11c24f7..e805773e 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.192 +1.2.193 diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index 1ca725c7..2ed6ae06 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -95,7 +95,7 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: Path, tmpdir: project_and_write_partition(seqid2fam, seq_set, output) write_gene_to_gene_family(seqid2fam, seq_set, output) - fam2seq = {gf.ID: seqid for seqid, gf in seqid2fam} + fam2seq = {gf.ID: seqid for seqid, gf in seqid2fam.items()} for pan_family in seqid2fam.values(): families_of_interest.add(pan_family) From 382b18f41b90bdfdf29154cb6fe4c0de451e8342 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Fri, 13 Oct 2023 
11:56:02 +0200 Subject: [PATCH 149/173] Fix problem with len generator --- VERSION | 2 +- ppanggolin/figures/draw_spot.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/VERSION b/VERSION index 0da01cf8..1b3db87e 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.187 +1.2.188 diff --git a/ppanggolin/figures/draw_spot.py b/ppanggolin/figures/draw_spot.py index bd80b2a3..8a603a02 100644 --- a/ppanggolin/figures/draw_spot.py +++ b/ppanggolin/figures/draw_spot.py @@ -648,7 +648,7 @@ def draw_spots(pangenome: Pangenome, output: Path, spot_list: str, disable_bar: if spot_list == 'all' or any(x == 'all' for x in spot_list): logging.getLogger("PPanGGOLiN").debug(f"'all' value is found in spot list, all spots are drawn.") - selected_spots = pangenome.spots + selected_spots = list(pangenome.spots) elif spot_list == "synteny" or any(x == 'synteny' for x in spot_list): logging.getLogger().debug(f"'synteny' value is found in spot list, all spots with more than 1 conserved synteny are drawn.") selected_spots = [s for s in pangenome.spots if len(s.get_uniq_ordered_set()) > 1] From 25b39ce6fde6e7f22a450bd0683578f904501ebe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Fri, 13 Oct 2023 11:56:38 +0200 Subject: [PATCH 150/173] Fix problem to write RGP, spot, module for gene without them --- VERSION | 2 +- ppanggolin/formats/writeFlat.py | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/VERSION b/VERSION index 1b3db87e..3cff2901 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.188 +1.2.189 diff --git a/ppanggolin/formats/writeFlat.py b/ppanggolin/formats/writeFlat.py index 781f7c2f..efca6621 100644 --- a/ppanggolin/formats/writeFlat.py +++ b/ppanggolin/formats/writeFlat.py @@ -588,14 +588,13 @@ def write_org_file(org: Organism, output: Path, compress: bool = False): len(list(gene.family.get_genes_per_org(org))), gene.family.named_partition, nb_pers, nb_shell, nb_cloud] if needRegions: - if gene.RGP is not 
None: - rgp = gene.RGP.name + rgp = gene.RGP.name if gene.RGP is not None else "" row.append(rgp) if needSpots: - if gene.RGP is not None and gene.RGP.spot is not None: - spot = gene.RGP.spot.ID + spot = gene.RGP.spot.ID if gene.RGP is not None and gene.RGP.spot is not None else "" row.append(spot) if needModules: + module = "" if gene.family.number_of_modules > 0: modules = ','.join(["module_" + str(module.ID) for module in gene.family.modules]) row.append(modules) From 765a24663855b646df0c9af0f7509682d867015e Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Fri, 13 Oct 2023 18:05:55 +0200 Subject: [PATCH 151/173] add target family column and fix sequence id --- docs/user/Genomic-context.md | 5 +++-- ppanggolin/context/searchGeneContext.py | 23 ++++++++++++++--------- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/docs/user/Genomic-context.md b/docs/user/Genomic-context.md index 671b9e20..96b2e6d0 100644 --- a/docs/user/Genomic-context.md +++ b/docs/user/Genomic-context.md @@ -30,13 +30,14 @@ In this case, you can give a pangenome without gene families representatives seq In case of you are using families ID, you will only have as output the `gene_context.tsv` file. In the other case, you use sequences, you will have another output file to report the alignment between sequences and pangenome families (see detail in align subcommand). -There are 4 columns in `gene_context.tsv`. +There are 6 columns in `gene_context.tsv`. -1. **geneContext ID**: identifier of the found context. It is incrementally generated, beginning with 1 +1. **geneContext ID**: Identifier of the found context. It is incrementally generated, beginning with 1 2. **Gene family name**: Identifier of the gene family, from the pangenome, correspond to the found context 3. **Sequence ID**: Identifier of the searched sequence in the pangenome 4. **Nb Genomes**: Number of genomes where the genomic context is found 5. 
**Partition**: Partition of the gene family corresponding to the found context +6. **Target family**: Whether the family is a target family, meaning it matches an input sequence, or a family provided as input. In **sequence Id**, it is possible to find a NA value. This case, correspond to another gene family found in the context. diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index 2ed6ae06..18fe83ce 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -66,7 +66,7 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: Path, tmpdir: check_pangenome_info(pangenome, need_annotations=True, need_families=True, disable_bar=disable_bar) families_of_interest = set() - fam2seq = {} + family_2_input_seqid = None if sequence_file is not None: # Alignment of sequences on pangenome families with read_compressed_or_not(sequence_file) as seqFileObj: @@ -95,7 +95,11 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: Path, tmpdir: project_and_write_partition(seqid2fam, seq_set, output) write_gene_to_gene_family(seqid2fam, seq_set, output) - fam2seq = {gf.ID: seqid for seqid, gf in seqid2fam.items()} + + family_2_input_seqid = defaultdict(set) + for seqid, gf in seqid2fam.items(): + family_2_input_seqid[gf].add(seqid) + for pan_family in seqid2fam.values(): families_of_interest.add(pan_family) @@ -140,8 +144,7 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: Path, tmpdir: f"There are {sum((len(gc) for gc in gene_contexts))} families among {len(gene_contexts)} gene contexts") output_file = output / "gene_contexts.tsv" - - export_context_to_dataframe(gene_contexts, fam2seq, output_file) + export_context_to_dataframe(gene_contexts, family_2_input_seqid, families_of_interest, output_file) else: logging.getLogger("PPanGGOLiN").info("No gene contexts were found") @@ -569,29 +572,31 @@ def fam_to_seq(seq_to_pan: dict) -> dict: return 
fam_2_seq -def export_context_to_dataframe(gene_contexts: set, fam2seq: Dict[str, int], output: Path): +def export_context_to_dataframe(gene_contexts: set, fam2seq: Dict[str, int], families_of_interest: Set[GeneFamily], output: Path): """ Export the results into dataFrame :param gene_contexts: connected components found in the pangenome :param fam2seq: Dictionary with gene families ID as keys and list of sequence ids as values + :param families_of_interest: families of interest that are at the origine of the context. :param output: output path """ lines = [] for gene_context in gene_contexts: for family in gene_context.families: - - if fam2seq.get(family.ID) is None: + if fam2seq.get(family) is None: sequence_id = None else: - sequence_id = ','.join(fam2seq.get(family.ID)) + sequence_id = ','.join(fam2seq.get(family)) family_info = {"GeneContext ID": gene_context.ID, "Gene family name": family.name, "Sequence ID": sequence_id, "Nb Genomes": family.number_of_organisms, - "Partition": family.named_partition} + "Partition": family.named_partition, + "Target family": family in families_of_interest} + lines.append(family_info) df = pd.DataFrame(lines).set_index("GeneContext ID") From b878a4c649ca45657ae90ce53c29dfd19f5dd43f Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Fri, 13 Oct 2023 18:30:07 +0200 Subject: [PATCH 152/173] fix famseq wrong None type --- ppanggolin/context/searchGeneContext.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index 18fe83ce..fc3ded05 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -66,7 +66,7 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: Path, tmpdir: check_pangenome_info(pangenome, need_annotations=True, need_families=True, disable_bar=disable_bar) families_of_interest = set() - family_2_input_seqid = None + family_2_input_seqid = {} if sequence_file is not 
None: # Alignment of sequences on pangenome families with read_compressed_or_not(sequence_file) as seqFileObj: From 32bd30af0dde50c8a5f2ef7fddc4438cdc82c3d4 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 17 Oct 2023 18:45:03 +0200 Subject: [PATCH 153/173] remove the need of a proksee template --- ppanggolin/formats/proksee_template.json | 74 ------------ ppanggolin/formats/write_proksee.py | 137 +++++++++++++---------- 2 files changed, 80 insertions(+), 131 deletions(-) delete mode 100644 ppanggolin/formats/proksee_template.json diff --git a/ppanggolin/formats/proksee_template.json b/ppanggolin/formats/proksee_template.json deleted file mode 100644 index a6952dbd..00000000 --- a/ppanggolin/formats/proksee_template.json +++ /dev/null @@ -1,74 +0,0 @@ -{ - "cgview": { - "settings":{ - "format":"circular", - "geneticCode":11, - "backgroundColor":"rgba(255,255,255,1)", - "showShading":true, - "arrowHeadLength":0.3, - "minArcLength":1, - "initialMapThicknessProportion":0.1, - "maxMapThicknessProportion":0.5}, - "backbone":{"color":"rgba(128,128,128,1)", - "colorAlternate":"rgba(200,200,200,1)", - "thickness":5, - "decoration":"arrow" - }, - "ruler": { - "font":"sans-serif,plain,10", - "color":"rgba(0,0,0,1)"}, - "annotation":{"font":"monospace,plain,12", - "onlyDrawFavorites":false, - "visible":true - }, - "dividers":{ - "track": { - "color": "rgba(50,50,50,1)", - "thickness": 1, - "spacing": 1 - }, - "slot":{ - "visible":true, - "color":"rgba(0,0,0,1)", - "thickness":1, - "spacing":1 - } - }, - "highlighter":{ - "visible":true - }, - "captions":[ - { - "position":"bottom-center", - "textAlignment":"center", - "font":"sans-serif,plain,24", - "fontColor":"rgba(0,0,0,1)", - "backgroundColor":"rgba(255,255,255,0.4)" - } - ], - "legend":{ - "position":"top-right", - "textAlignment":"left", - "defaultFont":"sans-serif,plain,14", - "defaultFontColor":"rgba(0,0,0,1)", - "backgroundColor":"rgba(255,255,255,0.75)" - }, - "sequence": { - "color": "rgb(0,0,0)", - "font": 
"sans-serif,plain,14" - }, - "bookmarks": [ - { - "bbOffset": 86.75, - "bp": 5617, - "favorite": false, - "format": "circular", - "name": "Bookmark-1", - "shortcut": "1", - "zoom": 1.685 - } - ], - "plots": [ - ] - } -} \ No newline at end of file diff --git a/ppanggolin/formats/write_proksee.py b/ppanggolin/formats/write_proksee.py index d87b43f1..5af353e7 100644 --- a/ppanggolin/formats/write_proksee.py +++ b/ppanggolin/formats/write_proksee.py @@ -11,7 +11,9 @@ from tqdm import tqdm from typing import Dict, List, Tuple, Set from uuid import uuid4 -from itertools import cycle +from itertools import cycle +from collections import defaultdict + # installed libraries from bokeh.palettes import Category20 from plotly.express.colors import qualitative @@ -38,11 +40,11 @@ def read_settings(settings_data: dict): settings_data["geneticCode"] = "11" -def write_legend_items(legend_data: dict, features: List[str], modules: Set[Module]): #, sources: List[str]): +def write_legend_items(features: List[str]):#, modules: Set[Module]): #, sources: List[str]): - colors = palette() + # colors = palette() # use https://medialab.github.io/iwanthue/ to find nice colors # that associate well with established partition colors (orange, light green, light blue) main_colors = { @@ -54,29 +56,31 @@ def write_legend_items(legend_data: dict, features: List[str], modules: Set[Modu "dark red": "#ca5c55", } - - legend_data["items"] = [ + legend_data = {"items" : [ {"name": "persistent", "swatchColor": main_colors['orange'], "decoration": "arrow"}, {"name": "shell", "swatchColor": main_colors['light green'], "decoration": "arrow"}, {"name": "cloud", "swatchColor": main_colors['light blue'], "decoration": "arrow"}, - {"name": "RNA", "swatchColor": main_colors['purple'], "decoration": "arrow"},] + {"name": "RNA", "swatchColor": main_colors['purple'], "decoration": "arrow"}, + ] + } if "rgp" in features or "all" in features: legend_data["items"].append({"name": "RGP", "swatchColor": 
main_colors['dark green'], "decoration": "arc"}), + # if "spots" in features or "all" in features: # legend_data["items"].append({"name": "Spot", "swatchColor": main_colors['dark red'], 1)", "decoration": "arc"}) if "modules" in features or "all" in features: - if modules is None: - legend_data["items"].append({"name": "Module", "swatchColor": main_colors['dark red'], "decoration": "arc"}) - else: - for mod in modules: - legend_data["items"].append({"name": f"module_{mod.ID}", "decoration": "arc", "swatchColor": next(colors)}) - + # if modules is None: + legend_data["items"].append({"name": "Module", "swatchColor": main_colors['dark red'], "decoration": "arc"}) + # else: + # for mod in modules: + # legend_data["items"].append({"name": f"module_{mod.ID}", "decoration": "arc", "swatchColor": next(colors)}) + return legend_data def write_tracks(features: List[str]): tracks = [{"name": "Gene", "separateFeaturesBy": "strand", "position": "outside", "thicknessRatio": 1, - "dataType": "feature", "dataMethod": "source", "dataKeys": "Gene"}, # {"name": "Partition", "separateFeaturesBy": "strand", "position": "outside", "thicknessRatio": 1, "dataType": "feature", "dataMethod": "source", "dataKeys": "partition"} + "dataType": "feature", "dataMethod": "source", "dataKeys": "Gene"} ] if "rgp" in features or "all" in features: @@ -122,8 +126,37 @@ def read_data(template: Path, features: List[str], modules: List[str] = None) -> proksee_data["cgview"]["tracks"] = write_tracks(features) return proksee_data +def initiate_proksee_data(features, org_name): + """ + + """ + + proksee_legends = write_legend_items(features) + + proksee_tracks = write_tracks(features) + + proksee_captions = { + "name": f"{org_name} annotated with PPanGGOLiN", + "position": "bottom-center", + "font": "sans-serif,plain,18", + "backgroundColor": "rgba(255,255,255,0.4)" + } + + cgview_data = {"name": "PPanGGOLiN annotations at genome levels", + "version": "1.5.0", + 'settings':{}, + "legend":proksee_legends, + 
"tracks":proksee_tracks, + "sequence":{}, + 'captions':[proksee_captions], + } + + return {"cgview":cgview_data} + def write_contig(organism: Organism, genome_sequences): + """ + """ contigs_data_list = [] for contig in tqdm(organism.contigs, unit="contig", disable=True): contig_info = {"name": contig.name, @@ -136,19 +169,15 @@ def write_contig(organism: Organism, genome_sequences): return contigs_data_list -def write_genes(organism: Organism, sources: List[str]=None): +def write_genes(organism: Organism, disable_bar=True): genes_data_list = [] - gf2gene = {} + gf2gene = defaultdict(list) - for gene in tqdm(organism.genes, total=organism.number_of_genes(), unit="genes", disable=True): + for gene in tqdm(organism.genes, total=organism.number_of_genes(), unit="genes", disable=disable_bar): gf = gene.family - if gf.name in gf2gene: - gf2gene[gf.name].append(gene) - else: - gf2gene[gf.name] = [gene] - # annotations = {source: "|".join(list(map(str, gf.get_source(source)))) for source in gf.sources if - # source in sources} + gf2gene[gf.name].append(gene) + genes_data_list.append({"name": gene.name, "type": "Gene", "contig": gene.contig.name, @@ -162,7 +191,7 @@ def write_genes(organism: Organism, sources: List[str]=None): "meta": ""#annotations }) - for gene in tqdm(organism.rna_genes, total=organism.number_of_rnas(), unit="rnas", disable=True): + for gene in tqdm(organism.rna_genes, total=organism.number_of_rnas(), unit="rnas", disable=disable_bar): genes_data_list.append({"name": gene.name, "type": "Gene", @@ -176,16 +205,13 @@ def write_genes(organism: Organism, sources: List[str]=None): "legend": "RNA", "meta": ""#annotations }) - - + return genes_data_list, gf2gene def write_partition(organism: Organism): partition_data_list = [] - c=0 for gene in tqdm(organism.genes, total=organism.number_of_genes(), unit="genes", disable=True): - c += 1 partition_data_list.append({"name": gene.family.name, "presence": gene.family.named_partition, "contig": gene.contig.name, @@ 
-195,7 +221,6 @@ def write_partition(organism: Organism): "legend": gene.family.named_partition, "tags": ["partition"]}) - return partition_data_list @@ -266,25 +291,23 @@ def write_modules(pangenome: Pangenome, organism: Organism, gf2genes: Dict[str, def write_proksee_organism(pangenome: Pangenome, organism: Organism, output: Path, template: Path, features: List[str] = None, modules: List[str] = None, genome_sequences= None): - proksee_data = read_data(template=template, features=features, modules=None) - - if "name" not in proksee_data["cgview"]["captions"]: - proksee_data["cgview"]["captions"][0]["name"] = f"{organism.name} annotated with PPanGGOLiN" + proksee_data = initiate_proksee_data(features, organism.name) + proksee_data["cgview"]["sequence"]["contigs"] = write_contig(organism, genome_sequences) - if "features" not in proksee_data["cgview"]: - proksee_data["cgview"]["features"] = [] - genes_features, gf2genes = write_genes(organism, sources=None) + genes_features, gf2genes = write_genes(organism) - proksee_data["cgview"]["features"] += genes_features - proksee_data["cgview"]["features"] += write_partition(organism) + proksee_data["cgview"]["features"] = genes_features + # proksee_data["cgview"]["features"] += write_partition(organism) if "rgp" in features or "all" in features: proksee_data["cgview"]["features"] += write_rgp(pangenome=pangenome, organism=organism) + # if "spots" in features or "all" in features: # proksee_data["cgview"]["features"] += write_spots(pangenome=pangenome, organism=organism, gf2genes=gf2genes) + if "modules" in features or "all" in features: proksee_data["cgview"]["features"] += write_modules(pangenome=pangenome, organism=organism, gf2genes=gf2genes) @@ -293,23 +316,23 @@ def write_proksee_organism(pangenome: Pangenome, organism: Organism, output: Pat json.dump(proksee_data, out_json, indent=2) -def write_proksee(pangenome: Pangenome, output: Path, features: List[str] = None, sources: List[str] = None, - template: Path = None, 
organisms_list: List[str] = None, threads: int = 1, disable_bar: bool = False): - assert features is not None - if template is None: - template = Path(__file__).parent.joinpath("proksee_template").with_suffix(".json") - if organisms_list is not None: - organisms = [organism for organism in pangenome.organisms if organism.name in organisms_list] - else: - organisms = pangenome.organisms - with ThreadPoolExecutor(max_workers=threads) as executor: - with tqdm(total=len(organisms), unit='organism', disable=disable_bar) as progress: - futures = [] - for organism in organisms: - future = executor.submit(write_proksee_organism, pangenome, organism, output, - template, features, sources) - future.add_done_callback(lambda p: progress.update()) - futures.append(future) - - for future in futures: - future.result() +# def write_proksee(pangenome: Pangenome, output: Path, features: List[str] = None, sources: List[str] = None, +# template: Path = None, organisms_list: List[str] = None, threads: int = 1, disable_bar: bool = False): +# assert features is not None +# if template is None: +# template = Path(__file__).parent.joinpath("proksee_template").with_suffix(".json") +# if organisms_list is not None: +# organisms = [organism for organism in pangenome.organisms if organism.name in organisms_list] +# else: +# organisms = pangenome.organisms +# with ThreadPoolExecutor(max_workers=threads) as executor: +# with tqdm(total=len(organisms), unit='organism', disable=disable_bar) as progress: +# futures = [] +# for organism in organisms: +# future = executor.submit(write_proksee_organism, pangenome, organism, output, +# template, features, sources) +# future.add_done_callback(lambda p: progress.update()) +# futures.append(future) + +# for future in futures: +# future.result() From a4795fec1e3a191002e1f210a2d7b311d40970a7 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 18 Oct 2023 17:53:11 +0200 Subject: [PATCH 154/173] color module with different colors according their proximity 
--- ppanggolin/context/searchGeneContext.py | 61 +--------------- ppanggolin/formats/writeFlat.py | 97 ++++++++++++++++++++++--- ppanggolin/formats/write_proksee.py | 37 +++++----- ppanggolin/utils.py | 58 +++++++++++++++ 4 files changed, 163 insertions(+), 90 deletions(-) diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index fc3ded05..535bc9be 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -9,7 +9,7 @@ import logging import os from typing import List, Dict, Tuple, Iterable, Hashable, Iterator, Set -from itertools import zip_longest, chain +from itertools import chain from collections import defaultdict from pathlib import Path @@ -21,7 +21,7 @@ # local libraries from ppanggolin.formats import check_pangenome_info from ppanggolin.genome import Gene, Contig -from ppanggolin.utils import mk_outdir, restricted_float, create_tmpdir, read_compressed_or_not +from ppanggolin.utils import mk_outdir, restricted_float, create_tmpdir, read_compressed_or_not, extract_contig_window from ppanggolin.pangenome import Pangenome from ppanggolin.align.alignOnPang import project_and_write_partition, get_input_seq_to_family_with_rep, \ get_input_seq_to_family_with_all, get_seq_ids @@ -445,63 +445,6 @@ def get_n_next_genes_index(current_index: int, next_genes_count: int, yield next_gene_index -def extract_contig_window(contig_size: int, positions_of_interest: Iterable[int], window_size: int, - is_circular: bool = False): - """ - Extracts contiguous windows around positions of interest within a contig. - - :param contig_size: Number of genes in contig. - :param positions_of_interest: An iterable containing the positions of interest. - :param window_size: The size of the window to extract around each position of interest. - :param is_circular: Indicates if the contig is circular. - :return: Yields tuples representing the start and end positions of each contiguous window. 
- """ - windows_coordinates = [] - - # Sort the positions of interest - sorted_positions = sorted(positions_of_interest) - - # Check if any position of interest is out of range - if sorted_positions[0] < 0 or sorted_positions[-1] >= contig_size: - raise IndexError(f'Positions of interest are out of range. ' - f"Contig has {contig_size} genes while given min={sorted_positions[0]} & max={sorted_positions[-1]} positions") - - if is_circular: - first_position = sorted_positions[0] - last_position = sorted_positions[-1] - # in a circular contig, if the window of a gene of interest overlaps the end/start of the contig - # an out of scope position is added to the sorted positions to take into account those positions - # the returned window are always checked that its positions are not out of range... - # so there's no chance to find an out of scope position in final list - if first_position - window_size < 0: - out_of_scope_position = contig_size + first_position - sorted_positions.append(out_of_scope_position) - - if last_position + window_size >= contig_size: - out_of_scope_position = last_position - contig_size - sorted_positions.insert(0, out_of_scope_position) - - start_po = max(sorted_positions[0] - window_size, 0) - - for position, next_po in zip_longest(sorted_positions, sorted_positions[1:]): - - if next_po is None: - # If there are no more positions, add the final window - end_po = min(position + window_size, contig_size - 1) - windows_coordinates.append((start_po, end_po)) - - elif position + window_size + 1 < next_po - window_size: - # If there is a gap between positions, add the current window - # and update the start position for the next window - end_po = min(position + window_size, contig_size - 1) - - windows_coordinates.append((start_po, end_po)) - - start_po = max(next_po - window_size, 0) - - return windows_coordinates - - def get_contig_to_genes(gene_families: Iterable[GeneFamily]) -> Dict[Contig, Set[Gene]]: """ Group genes from specified gene 
families by contig. diff --git a/ppanggolin/formats/writeFlat.py b/ppanggolin/formats/writeFlat.py index 2d29bcc0..1265b6cc 100644 --- a/ppanggolin/formats/writeFlat.py +++ b/ppanggolin/formats/writeFlat.py @@ -5,6 +5,7 @@ import argparse import logging from multiprocessing import get_context +from itertools import combinations from collections import Counter, defaultdict import logging from typing import TextIO,List, Dict @@ -13,17 +14,23 @@ from importlib.metadata import distribution from statistics import median, mean, stdev import os +import random + +import networkx as nx +from plotly.express.colors import qualitative + # local libraries from ppanggolin.edge import Edge from ppanggolin.geneFamily import GeneFamily from ppanggolin.genome import Organism, Gene, Contig, RNA -from ppanggolin.region import Region, Spot +from ppanggolin.region import Region, Spot, Module from ppanggolin.pangenome import Pangenome -from ppanggolin.utils import write_compressed_or_not, mk_outdir, restricted_float, read_compressed_or_not +from ppanggolin.utils import write_compressed_or_not, mk_outdir, restricted_float, read_compressed_or_not, extract_contig_window from ppanggolin.formats.readBinaries import check_pangenome_info from ppanggolin.formats.write_proksee import write_proksee_organism from ppanggolin.formats.writeSequences import read_genome_file + # global variable to store the pangenome pan = Pangenome() # TODO change to pangenome:Pangenome = Pangenome=() ? 
needAnnotations = False @@ -649,26 +656,94 @@ def write_proksee(output: Path, compress: bool = False, fasta = None, anno = Non for mod in pan.modules: for org in mod.organisms: org_to_modules[org].add(mod) - + + module_to_colors = manage_module_colors(set(pan.modules)) features = ["all"] - template = Path(__file__).parent.joinpath("proksee_template").with_suffix(".json") - - organism_with_rgp = {rgp.organism for rgp in pan.regions} - - for organism in organism_with_rgp : #pan.organisms: + for organism in pan.organisms: if organisms_file: genome_sequences = read_genome_file(org_dict, organism.name) else: genome_sequences = None - write_proksee_organism(pan, organism, output, - template, features=features, - modules=org_to_modules[organism], + org_module_to_color = {org_mod:module_to_colors[org_mod] for org_mod in org_to_modules[organism]} + + write_proksee_organism(pan, organism, output, features=features, module_to_colors = org_module_to_color, genome_sequences=genome_sequences) + +def manage_module_colors(modules: List[Module], window_size:int=30) -> Dict[Module, str]: + """ + Manages colors for a list of modules based on gene positions and a specified window size. + + :param modules: A list of module objects for which you want to determine colors. + :param window_size: Minimum number of genes between two modules to color them with the same color. + A higher value results in more module colors. + :return: A dictionary that maps each module to its assigned color. 
+ """ + + color_mod_graph = nx.Graph() + color_mod_graph.add_nodes_from((module for module in modules)) + + contig_to_mod_genes = defaultdict(set) + gene_to_module = {} + + for module in modules: + for fam in module.families: + for gene in fam.genes: + contig_to_mod_genes[gene.contig].add(gene) + gene_to_module[gene] = module + + for contig, mod_genes in contig_to_mod_genes.items(): + gene_positions = (gene.position for gene in mod_genes) + contig_windows = extract_contig_window( + contig.number_of_genes, gene_positions, window_size=window_size, is_circular=contig.is_circular + ) + contig_windows = list(contig_windows) + + for (start, end) in contig_windows: + module_in_window = {gene_to_module[gene] for gene in mod_genes if start <= gene.position <= end} + + # Add edges between closely located modules + module_edges = [(mod_a, mod_b) for mod_a, mod_b in combinations(module_in_window, 2)] + color_mod_graph.add_edges_from(module_edges) + + module_to_color_int = nx.coloring.greedy_color(color_mod_graph) + + # If you want to export the graph to see the coloring: + # nx.set_node_attributes(color_mod_graph, color_dict, name="color") + # nx.readwrite.graphml.write_graphml(color_mod_graph, f"module_graph_window_size{window_size}.graphml") + + nb_colors = len(set(module_to_color_int.values())) + logging.getLogger().debug(f"We have found that {nb_colors} colors were necessary to color Modules.") + colors = palette(nb_colors) + + module_to_color = {mod: colors[col_i] for mod, col_i in module_to_color_int.items()} + + return module_to_color + +def palette(nb_colors: int) -> List[str]: + """ + Generates a palette of colors for visual representation. + + :param nb_colors: The number of colors needed in the palette. + + :return: A list of color codes in hexadecimal format. 
+ """ + + # Combine two sets of predefined colors for variety + colors = qualitative.Vivid + qualitative.Safe + if len(colors) < nb_colors: + # Generate random colors if not enough predefined colors are available + random.seed(1) + random_colors = ["#" + ''.join([random.choice('0123456789ABCDEF') for _ in range(6)]) for _ in range(nb_colors - len(colors))] + colors += random_colors + else: + colors = colors[:nb_colors] + + return colors def write_gff(output: str, compress: bool = False): diff --git a/ppanggolin/formats/write_proksee.py b/ppanggolin/formats/write_proksee.py index 5af353e7..c7889c46 100644 --- a/ppanggolin/formats/write_proksee.py +++ b/ppanggolin/formats/write_proksee.py @@ -40,11 +40,10 @@ def read_settings(settings_data: dict): settings_data["geneticCode"] = "11" -def write_legend_items(features: List[str]):#, modules: Set[Module]): #, sources: List[str]): - - - - # colors = palette() +def write_legend_items(features: List[str], module_to_color: Dict[Module, str]):#, modules: Set[Module]): #, sources: List[str]): + """ + + """ # use https://medialab.github.io/iwanthue/ to find nice colors # that associate well with established partition colors (orange, light green, light blue) main_colors = { @@ -66,15 +65,13 @@ def write_legend_items(features: List[str]):#, modules: Set[Module]): #, sources if "rgp" in features or "all" in features: legend_data["items"].append({"name": "RGP", "swatchColor": main_colors['dark green'], "decoration": "arc"}), - # if "spots" in features or "all" in features: - # legend_data["items"].append({"name": "Spot", "swatchColor": main_colors['dark red'], 1)", "decoration": "arc"}) - if "modules" in features or "all" in features: # if modules is None: - legend_data["items"].append({"name": "Module", "swatchColor": main_colors['dark red'], "decoration": "arc"}) + # legend_data["items"].append({"name": "Module", "swatchColor": main_colors['dark red'], "decoration": "arc"}) # else: - # for mod in modules: - # 
legend_data["items"].append({"name": f"module_{mod.ID}", "decoration": "arc", "swatchColor": next(colors)}) + for mod, color in sorted(module_to_color.items(), key=lambda x: x[0].ID): + legend_data["items"].append({"name": f"module_{mod.ID}", "decoration": "arc", "swatchColor": color, "visible":False}) + return legend_data def write_tracks(features: List[str]): @@ -126,12 +123,12 @@ def read_data(template: Path, features: List[str], modules: List[str] = None) -> proksee_data["cgview"]["tracks"] = write_tracks(features) return proksee_data -def initiate_proksee_data(features, org_name): +def initiate_proksee_data(features, org_name, module_to_color): """ """ - proksee_legends = write_legend_items(features) + proksee_legends = write_legend_items(features, module_to_color) proksee_tracks = write_tracks(features) @@ -266,9 +263,9 @@ def write_spots(pangenome: Pangenome, organism: Organism, gf2genes: Dict[str, Li return spots_data_list -def write_modules(pangenome: Pangenome, organism: Organism, gf2genes: Dict[str, List[Gene]]): +def write_modules(modules: List[Module], organism: Organism, gf2genes: Dict[str, List[Gene]]): modules_data_list = [] - for module in tqdm(pangenome.modules, unit="Module", disable=True): + for module in modules: gf_intersection = set(organism.families) & set(module.families) if gf_intersection: completion = round(len(gf_intersection) / len(set(module.families)), 2) @@ -279,7 +276,7 @@ def write_modules(pangenome: Pangenome, organism: Organism, gf2genes: Dict[str, "start": gene.start, "stop": gene.stop, "contig": gene.contig.name, - "legend": "Module",#f"module_{module.ID}", + "legend":f"module_{module.ID}", "source": "Module", "tags": [], "meta": { @@ -288,10 +285,10 @@ def write_modules(pangenome: Pangenome, organism: Organism, gf2genes: Dict[str, return modules_data_list -def write_proksee_organism(pangenome: Pangenome, organism: Organism, output: Path, template: Path, - features: List[str] = None, modules: List[str] = None, 
genome_sequences= None): +def write_proksee_organism(pangenome: Pangenome, organism: Organism, output: Path, + features: List[str] = None, module_to_colors: Dict[Module,str] = None, genome_sequences= None): - proksee_data = initiate_proksee_data(features, organism.name) + proksee_data = initiate_proksee_data(features, organism.name, module_to_colors) proksee_data["cgview"]["sequence"]["contigs"] = write_contig(organism, genome_sequences) @@ -309,7 +306,7 @@ def write_proksee_organism(pangenome: Pangenome, organism: Organism, output: Pat # proksee_data["cgview"]["features"] += write_spots(pangenome=pangenome, organism=organism, gf2genes=gf2genes) if "modules" in features or "all" in features: - proksee_data["cgview"]["features"] += write_modules(pangenome=pangenome, organism=organism, gf2genes=gf2genes) + proksee_data["cgview"]["features"] += write_modules(modules=module_to_colors, organism=organism, gf2genes=gf2genes) logging.debug(f"Write proksee for {organism.name}") with open(output.joinpath(organism.name).with_suffix(".json"), "w") as out_json: diff --git a/ppanggolin/utils.py b/ppanggolin/utils.py index 7dc81e67..177d5162 100755 --- a/ppanggolin/utils.py +++ b/ppanggolin/utils.py @@ -13,6 +13,7 @@ from contextlib import contextmanager import tempfile import time +from itertools import zip_longest import networkx as nx from importlib.metadata import distribution @@ -935,3 +936,60 @@ def delete_unspecified_args(args: argparse.Namespace): for arg_name, arg_val in args._get_kwargs(): if arg_val is None: delattr(args, arg_name) + + +def extract_contig_window(contig_size: int, positions_of_interest: Iterable[int], window_size: int, + is_circular: bool = False): + """ + Extracts contiguous windows around positions of interest within a contig. + + :param contig_size: Number of genes in contig. + :param positions_of_interest: An iterable containing the positions of interest. + :param window_size: The size of the window to extract around each position of interest. 
+ :param is_circular: Indicates if the contig is circular. + :return: Yields tuples representing the start and end positions of each contiguous window. + """ + windows_coordinates = [] + + # Sort the positions of interest + sorted_positions = sorted(positions_of_interest) + + # Check if any position of interest is out of range + if sorted_positions[0] < 0 or sorted_positions[-1] >= contig_size: + raise IndexError(f'Positions of interest are out of range. ' + f"Contig has {contig_size} genes while given min={sorted_positions[0]} & max={sorted_positions[-1]} positions") + + if is_circular: + first_position = sorted_positions[0] + last_position = sorted_positions[-1] + # in a circular contig, if the window of a gene of interest overlaps the end/start of the contig + # an out of scope position is added to the sorted positions to take into account those positions + # the returned window are always checked that its positions are not out of range... + # so there's no chance to find an out of scope position in final list + if first_position - window_size < 0: + out_of_scope_position = contig_size + first_position + sorted_positions.append(out_of_scope_position) + + if last_position + window_size >= contig_size: + out_of_scope_position = last_position - contig_size + sorted_positions.insert(0, out_of_scope_position) + + start_po = max(sorted_positions[0] - window_size, 0) + + for position, next_po in zip_longest(sorted_positions, sorted_positions[1:]): + + if next_po is None: + # If there are no more positions, add the final window + end_po = min(position + window_size, contig_size - 1) + windows_coordinates.append((start_po, end_po)) + + elif position + window_size + 1 < next_po - window_size: + # If there is a gap between positions, add the current window + # and update the start position for the next window + end_po = min(position + window_size, contig_size - 1) + + windows_coordinates.append((start_po, end_po)) + + start_po = max(next_po - window_size, 0) + + return 
windows_coordinates \ No newline at end of file From 53dac90b5182289af674f617d0311f05c7723d6a Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 18 Oct 2023 18:12:56 +0200 Subject: [PATCH 155/173] clean proksee generation code and add type and docstring --- ppanggolin/formats/write_proksee.py | 399 +++++++++++++--------------- 1 file changed, 189 insertions(+), 210 deletions(-) diff --git a/ppanggolin/formats/write_proksee.py b/ppanggolin/formats/write_proksee.py index c7889c46..d90e175d 100644 --- a/ppanggolin/formats/write_proksee.py +++ b/ppanggolin/formats/write_proksee.py @@ -2,47 +2,31 @@ # coding:utf-8 # default libraries -from concurrent.futures import ThreadPoolExecutor -from datetime import datetime import json import logging from pathlib import Path -from random import randint from tqdm import tqdm -from typing import Dict, List, Tuple, Set -from uuid import uuid4 -from itertools import cycle +from typing import Dict, List, Tuple from collections import defaultdict # installed libraries -from bokeh.palettes import Category20 -from plotly.express.colors import qualitative -from ppanggolin.genome import Organism, Contig, Gene -from ppanggolin.region import Spot, Module # local libraries +from ppanggolin.genome import Organism, Gene +from ppanggolin.region import Module from ppanggolin.pangenome import Pangenome -def palette() -> List[Tuple[int]]: - palette = qualitative.Vivid + qualitative.Pastel2 + qualitative.Pastel1 + qualitative.Antique + qualitative.Safe + qualitative.Bold - palette = cycle(palette) - return palette - - -def read_settings(settings_data: dict): - if "format" not in settings_data: - settings_data["format"] = "circular" - if "geneticCode" not in settings_data: - # TODO Manage genetic code - settings_data["geneticCode"] = "11" +def write_legend_items(features: List[str], module_to_color: Dict[Module, str]): + """ + Generates legend items based on the selected features and module-to-color mapping. 
+ :param features: A list of features to include in the legend. + :param module_to_color: A dictionary mapping modules to their assigned colors. -def write_legend_items(features: List[str], module_to_color: Dict[Module, str]):#, modules: Set[Module]): #, sources: List[str]): - """ - + :return: A data structure containing legend items based on the selected features and module colors. """ # use https://medialab.github.io/iwanthue/ to find nice colors # that associate well with established partition colors (orange, light green, light blue) @@ -66,270 +50,265 @@ def write_legend_items(features: List[str], module_to_color: Dict[Module, str]): legend_data["items"].append({"name": "RGP", "swatchColor": main_colors['dark green'], "decoration": "arc"}), if "modules" in features or "all" in features: - # if modules is None: - # legend_data["items"].append({"name": "Module", "swatchColor": main_colors['dark red'], "decoration": "arc"}) - # else: for mod, color in sorted(module_to_color.items(), key=lambda x: x[0].ID): legend_data["items"].append({"name": f"module_{mod.ID}", "decoration": "arc", "swatchColor": color, "visible":False}) return legend_data def write_tracks(features: List[str]): + """ + Generates track information based on the selected features. + + :param features: A list of features to include in the ProkSee data. + + :return: A list of track configurations based on the selected features. 
+ """ + tracks = [ + { + "name": "Gene", + "separateFeaturesBy": "strand", + "position": "outside", + "thicknessRatio": 1, + "dataType": "feature", + "dataMethod": "source", + "dataKeys": "Gene" + } + ] - tracks = [{"name": "Gene", "separateFeaturesBy": "strand", "position": "outside", "thicknessRatio": 1, - "dataType": "feature", "dataMethod": "source", "dataKeys": "Gene"} - ] - if "rgp" in features or "all" in features: - tracks.append({"name": "RGP", "separateFeaturesBy": "None", "position": "inside", "thicknessRatio": 1, - "dataType": "feature", "dataMethod": "source", "dataKeys": "RGP"}), - # if "spots" in features or "all" in features: - # tracks.append({"name": "Spots", "separateFeaturesBy": "None", "position": "inside", "thicknessRatio": 1, - # "dataType": "feature", "dataMethod": "source", "dataKeys": "Spot"}) + tracks.append({ + "name": "RGP", + "separateFeaturesBy": "None", + "position": "inside", + "thicknessRatio": 1, + "dataType": "feature", + "dataMethod": "source", + "dataKeys": "RGP" + }) + if "modules" in features or "all" in features: - tracks.append({"name": "Module", "separateFeaturesBy": "None", "position": "inside", "thicknessRatio": 1, - "dataType": "feature", "dataMethod": "source", "dataKeys": "Module"}) + tracks.append({ + "name": "Module", + "separateFeaturesBy": "None", + "position": "inside", + "thicknessRatio": 1, + "dataType": "feature", + "dataMethod": "source", + "dataKeys": "Module" + }) return tracks -def read_data(template: Path, features: List[str], modules: List[str] = None) -> dict: - """ +def initiate_proksee_data(features: List[str], org_name: str, module_to_color: Dict[Module, str]): """ - - with open(template, "r") as template_file: - proksee_data = json.load(template_file) + Initializes ProkSee data structure with legends, tracks, and captions. - now = datetime.now() + :param features: A list of features to include in the ProkSee data. 
+ :param org_name: The name of the organism for which the ProkSee data is being generated. + :param module_to_color: A dictionary mapping modules to their assigned colors. - if "created" in proksee_data["cgview"]: - proksee_data["cgview"]["updated"] = now.strftime("%Y-%m-%d %H:%M:%S") - last_version = proksee_data["cgview"]["version"].split('.') - proksee_data["cgview"]["version"] = ".".join(last_version[:-1] + last_version[-1] + 1) - else: - proksee_data["cgview"]["created"] = now.strftime("%Y-%m-%d %H:%M:%S") - proksee_data["cgview"]["version"] = "1.0" + :return: ProkSee data structure containing legends, tracks, and captions. + """ + proksee_legends = write_legend_items(features, module_to_color) + proksee_tracks = write_tracks(features) - if "name" not in proksee_data["cgview"]: - proksee_data["cgview"]["name"] = "PPanGGOLiN annotations at genome levels" - proksee_data["cgview"]["id"] = uuid4().hex + proksee_captions = { + "name": f"{org_name} annotated with PPanGGOLiN", + "position": "bottom-center", + "font": "sans-serif,plain,18", + "backgroundColor": "rgba(255,255,255,0.4)" + } - read_settings(proksee_data["cgview"]["settings"]) + cgview_data = { + "name": "PPanGGOLiN annotations at genome levels", + "version": "1.5.0", + 'settings': {}, + "legend": proksee_legends, + "tracks": proksee_tracks, + "sequence": {}, + 'captions': [proksee_captions], + } - if "items" not in proksee_data["cgview"]["legend"]: - write_legend_items(proksee_data["cgview"]["legend"], features, modules) + return {"cgview": cgview_data} - if "tracks" not in proksee_data["cgview"]: - proksee_data["cgview"]["tracks"] = write_tracks(features) - return proksee_data -def initiate_proksee_data(features, org_name, module_to_color): +def write_contig(organism: Organism, genome_sequences: Dict[str, str] = None) -> List[Dict]: """ + Writes contig data for a given organism in proksee format. + + :param organism: The organism for which contig data will be written. 
+ :param genome_sequences: A dictionary mapping contig names to their DNA sequences (default: None). + :return: A list of contig data in a structured format. """ + contigs_data_list = [] - proksee_legends = write_legend_items(features, module_to_color) + for contig in tqdm(organism.contigs, unit="contig", disable=True): + contig_info = { + "name": contig.name, + "length": contig.length, + "orientation": "+", + } - proksee_tracks = write_tracks(features) + if genome_sequences: + contig_info['seq'] = genome_sequences.get(contig.name, "") - proksee_captions = { - "name": f"{org_name} annotated with PPanGGOLiN", - "position": "bottom-center", - "font": "sans-serif,plain,18", - "backgroundColor": "rgba(255,255,255,0.4)" - } + contigs_data_list.append(contig_info) - cgview_data = {"name": "PPanGGOLiN annotations at genome levels", - "version": "1.5.0", - 'settings':{}, - "legend":proksee_legends, - "tracks":proksee_tracks, - "sequence":{}, - 'captions':[proksee_captions], - } + return contigs_data_list - return {"cgview":cgview_data} -def write_contig(organism: Organism, genome_sequences): - """ +def write_genes(organism: Organism, disable_bar: bool = True) -> Tuple[List[Dict], Dict[str, List[Gene]]]: """ - contigs_data_list = [] - for contig in tqdm(organism.contigs, unit="contig", disable=True): - contig_info = {"name": contig.name, - "length": contig.length, - "orientation": "+", - } - if genome_sequences: - contig_info['seq'] = genome_sequences[contig.name] - contigs_data_list.append(contig_info) - return contigs_data_list + Writes gene data for a given organism, including both protein-coding genes and RNA genes. - -def write_genes(organism: Organism, disable_bar=True): + :param organism: The organism for which gene data will be written. + :param disable_bar: A flag to disable the progress bar when processing genes (default: True). + + :return: A tuple containing a list of gene data in a structured format and a dictionary mapping gene families to genes. 
+ """ genes_data_list = [] gf2gene = defaultdict(list) + # Process protein-coding genes for gene in tqdm(organism.genes, total=organism.number_of_genes(), unit="genes", disable=disable_bar): - gf = gene.family gf2gene[gf.name].append(gene) - genes_data_list.append({"name": gene.name, - "type": "Gene", - "contig": gene.contig.name, - "start": gene.start, - "stop": gene.stop, - "strand": 1 if gene.strand == "+" else -1, - "product": gene.product, - "tags": [gene.family.named_partition, gene.family.name], - "source": "Gene", - "legend": gene.family.named_partition, - "meta": ""#annotations - }) - + genes_data_list.append({ + "name": gene.name, + "type": "Gene", + "contig": gene.contig.name, + "start": gene.start, + "stop": gene.stop, + "strand": 1 if gene.strand == "+" else -1, + "product": gene.product, + "tags": [gene.family.named_partition, gene.family.name], + "source": "Gene", + "legend": gene.family.named_partition, + "meta": "" # annotations + }) + + # Process RNA genes for gene in tqdm(organism.rna_genes, total=organism.number_of_rnas(), unit="rnas", disable=disable_bar): + genes_data_list.append({ + "name": gene.name, + "type": "Gene", + "contig": gene.contig.name, + "start": gene.start, + "stop": gene.stop, + "strand": 1 if gene.strand == "+" else -1, + "product": gene.product, + "tags": [], + "source": "Gene", + "legend": "RNA", + "meta": "" # annotations + }) - genes_data_list.append({"name": gene.name, - "type": "Gene", - "contig": gene.contig.name, - "start": gene.start, - "stop": gene.stop, - "strand": 1 if gene.strand == "+" else -1, - "product": gene.product, - "tags": [], - "source": "Gene", - "legend": "RNA", - "meta": ""#annotations - }) - return genes_data_list, gf2gene -def write_partition(organism: Organism): - partition_data_list = [] - for gene in tqdm(organism.genes, total=organism.number_of_genes(), unit="genes", disable=True): - partition_data_list.append({"name": gene.family.name, - "presence": gene.family.named_partition, - "contig": 
gene.contig.name, - "start": gene.start, - "stop": gene.stop, - "source": "partition", - "legend": gene.family.named_partition, - "tags": ["partition"]}) - - return partition_data_list +def write_rgp(pangenome: Pangenome, organism: Organism): + """ + Writes RGP (Region of Genomic Plasticity) data for a given organism in proksee format. + :param pangenome: The pangenome containing information about RGPs. + :param organism: The specific organism for which RGP data will be written. -def write_rgp(pangenome: Pangenome, organism: Organism): + :return: A list of RGP data in a structured format. + """ rgp_data_list = [] + + # Iterate through each RGP in the pangenome for rgp in tqdm(pangenome.regions, unit="RGP", disable=True): if rgp.organism == organism: - rgp_data_list.append({"name": rgp.name, - "contig": rgp.contig.name, - "start": rgp.start, - "stop": rgp.stop, - "legend": "RGP", - 'source':"RGP", - "tags": []}) - return rgp_data_list + # Create an entry for the RGP in the data list + rgp_data_list.append({ + "name": rgp.name, + "contig": rgp.contig.name, + "start": rgp.start, + "stop": rgp.stop, + "legend": "RGP", + "source": "RGP", + "tags": [] + }) + return rgp_data_list -def write_spots(pangenome: Pangenome, organism: Organism, gf2genes: Dict[str, List[Gene]]): - spots_data_list = [] - for spot in tqdm(pangenome.spots, unit="Spot", disable=True): - spot: Spot - spot_orgs = set() - - for gf in spot.families: - spot_orgs |= set(gf.organisms) - - if organism in spot_orgs: - gf_intersection = set(organism.families) & set(spot.families) - completion = round(len(gf_intersection) / spot.number_of_families, 2) - for gf in gf_intersection: - for gene in gf2genes[gf.name]: - spots_data_list.append({"name": f"Spot_{spot.ID}", - "start": gene.start, - "stop": gene.stop, - "contig": gene.contig.name, - "legend": "Spot", - "source":"Spot", - "tags": [], - "meta": { - "completion": completion - }}) - return spots_data_list +def write_modules(modules: List[Module], organism: 
Organism, gf2genes: Dict[str, List[Gene]]): + """ + Writes module data in proksee format for a list of modules associated with a given organism. + :param modules: A list of modules for which data will be written. + :param organism: The organism to which the modules are associated. + :param gf2genes: A dictionary that maps gene families to the genes they contain. -def write_modules(modules: List[Module], organism: Organism, gf2genes: Dict[str, List[Gene]]): + :return: A list of module data in a structured format. + """ modules_data_list = [] + + # Iterate through each module and find intersecting gene families for module in modules: gf_intersection = set(organism.families) & set(module.families) + if gf_intersection: + # Calculate the completion percentage completion = round(len(gf_intersection) / len(set(module.families)), 2) + + # Create module data entries for genes within intersecting gene families for gf in gf_intersection: for gene in gf2genes[gf.name]: - modules_data_list.append({"name": f"Module_{module.ID}", - "presence": "Module", - "start": gene.start, - "stop": gene.stop, - "contig": gene.contig.name, - "legend":f"module_{module.ID}", - "source": "Module", - "tags": [], - "meta": { - "completion": completion - }}) + modules_data_list.append({ + "name": f"Module_{module.ID}", + "presence": "Module", + "start": gene.start, + "stop": gene.stop, + "contig": gene.contig.name, + "legend": f"module_{module.ID}", + "source": "Module", + "tags": [], + "meta": { + "completion": completion + } + }) + return modules_data_list def write_proksee_organism(pangenome: Pangenome, organism: Organism, output: Path, - features: List[str] = None, module_to_colors: Dict[Module,str] = None, genome_sequences= None): - - proksee_data = initiate_proksee_data(features, organism.name, module_to_colors) + features: List[str] = None, + module_to_colors: Dict[Module, str] = None, + genome_sequences: Dict[str,str] = None): + """ + Write ProkSee data for a given organism. 
+ :param pangenome: The pangenome to which the organism belongs. + :param organism: The organism for which ProkSee data will be written. + :param output: The output directory where ProkSee data will be written. + :param features: A list of features to include in the ProkSee data, e.g., ["rgp", "modules", "all"]. + :param module_to_colors: A dictionary mapping modules to their assigned colors. + :param genome_sequences: The genome sequences for the organism. - proksee_data["cgview"]["sequence"]["contigs"] = write_contig(organism, genome_sequences) + This function writes ProkSee data for a given organism, including contig information, genes colored by partition, RGPs, + and modules. The resulting data is saved as a JSON file in the specified output directory. + """ + proksee_data = initiate_proksee_data(features, organism.name, module_to_colors) + proksee_data["cgview"]["sequence"]["contigs"] = write_contig(organism, genome_sequences) genes_features, gf2genes = write_genes(organism) proksee_data["cgview"]["features"] = genes_features - # proksee_data["cgview"]["features"] += write_partition(organism) if "rgp" in features or "all" in features: proksee_data["cgview"]["features"] += write_rgp(pangenome=pangenome, organism=organism) - # if "spots" in features or "all" in features: - # proksee_data["cgview"]["features"] += write_spots(pangenome=pangenome, organism=organism, gf2genes=gf2genes) - if "modules" in features or "all" in features: proksee_data["cgview"]["features"] += write_modules(modules=module_to_colors, organism=organism, gf2genes=gf2genes) - logging.debug(f"Write proksee for {organism.name}") + logging.debug(f"Write ProkSee for {organism.name}") with open(output.joinpath(organism.name).with_suffix(".json"), "w") as out_json: json.dump(proksee_data, out_json, indent=2) - - -# def write_proksee(pangenome: Pangenome, output: Path, features: List[str] = None, sources: List[str] = None, -# template: Path = None, organisms_list: List[str] = None, threads: int = 
1, disable_bar: bool = False): -# assert features is not None -# if template is None: -# template = Path(__file__).parent.joinpath("proksee_template").with_suffix(".json") -# if organisms_list is not None: -# organisms = [organism for organism in pangenome.organisms if organism.name in organisms_list] -# else: -# organisms = pangenome.organisms -# with ThreadPoolExecutor(max_workers=threads) as executor: -# with tqdm(total=len(organisms), unit='organism', disable=disable_bar) as progress: -# futures = [] -# for organism in organisms: -# future = executor.submit(write_proksee_organism, pangenome, organism, output, -# template, features, sources) -# future.add_done_callback(lambda p: progress.update()) -# futures.append(future) - -# for future in futures: -# future.result() From 1e15c3135dd4aff92ed34a9e4147cbd0b9eb9aec Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 18 Oct 2023 18:47:25 +0200 Subject: [PATCH 156/173] reuse projection fct to parse genome paths file --- ppanggolin/formats/writeFlat.py | 45 ++++++++++++++------------- ppanggolin/projection/projection.py | 46 ++------------------------- ppanggolin/utils.py | 48 +++++++++++++++++++++++++++-- 3 files changed, 73 insertions(+), 66 deletions(-) diff --git a/ppanggolin/formats/writeFlat.py b/ppanggolin/formats/writeFlat.py index 1265b6cc..186f0be3 100644 --- a/ppanggolin/formats/writeFlat.py +++ b/ppanggolin/formats/writeFlat.py @@ -26,7 +26,7 @@ from ppanggolin.genome import Organism, Gene, Contig, RNA from ppanggolin.region import Region, Spot, Module from ppanggolin.pangenome import Pangenome -from ppanggolin.utils import write_compressed_or_not, mk_outdir, restricted_float, read_compressed_or_not, extract_contig_window +from ppanggolin.utils import write_compressed_or_not, mk_outdir, restricted_float, extract_contig_window, parse_input_paths_file from ppanggolin.formats.readBinaries import check_pangenome_info from ppanggolin.formats.write_proksee import write_proksee_organism from 
ppanggolin.formats.writeSequences import read_genome_file @@ -633,30 +633,31 @@ def write_projections(output: Path, compress: bool = False): logging.getLogger("PPanGGOLiN").info("Done writing the projection files") -def write_proksee(output: Path, compress: bool = False, fasta = None, anno = None): +def write_proksee(output: Path, fasta: Path = None, anno: Path = None): """ - """ - # TODO improve this part by using fct created in projection to read such file + Generate ProkSee data for multiple organisms and write it to the specified output directory. - organisms_file = fasta if fasta is not None else anno - - if organisms_file: - org_dict = {} - for line in read_compressed_or_not(organisms_file): - elements = [el.strip() for el in line.split("\t")] - if len(elements) <= 1: - raise Exception(f"No tabulation separator found in given --fasta or --anno file: '{organisms_file}'") - org_dict[elements[0]] = Path(elements[1]) - if not org_dict[elements[0]].exists(): # Check tsv sanity test if it's not one it's the other - org_dict[elements[0]] = organisms_file.parent.joinpath(org_dict[elements[0]]) + :param output: The directory where the ProkSee data will be written. + :param fasta: The path to a FASTA file containing genome sequences (optional). + :param anno: The path to an annotation file (optional). + This function generates ProkSee data for multiple organisms and writes it to the specified output directory. + If genome sequences are provided in a FASTA file or annotations in a separate file, they will be used in the generation + of ProkSee data for each organism to add sequences data to proksee files. 
+ """ + organisms_file = fasta if fasta is not None else anno + if organisms_file: + org_dict = parse_input_paths_file(organisms_file) org_to_modules = defaultdict(set) + + # Create a mapping of organisms to the modules they belong to for mod in pan.modules: for org in mod.organisms: org_to_modules[org].add(mod) - + + # Generate a color mapping for modules module_to_colors = manage_module_colors(set(pan.modules)) features = ["all"] @@ -667,11 +668,13 @@ def write_proksee(output: Path, compress: bool = False, fasta = None, anno = Non else: genome_sequences = None - org_module_to_color = {org_mod:module_to_colors[org_mod] for org_mod in org_to_modules[organism]} + # Generate a color mapping for modules specific to the organism + org_module_to_color = {org_mod: module_to_colors[org_mod] for org_mod in org_to_modules[organism]} + + # Write ProkSee data for the organism + write_proksee_organism(pan, organism, output, features=features, module_to_colors=org_module_to_color, + genome_sequences=genome_sequences) - write_proksee_organism(pan, organism, output, features=features, module_to_colors = org_module_to_color, - genome_sequences=genome_sequences) - def manage_module_colors(modules: List[Module], window_size:int=30) -> Dict[Module, str]: """ @@ -1288,7 +1291,7 @@ def write_flat_files(pangenome: Pangenome, output: Path, cpu: int = 1, soft_core if gff: processes.append(p.apply_async(func=write_gff, args=(output, compress))) if proksee: - processes.append(p.apply_async(func=write_proksee, args=(output, compress, fasta, anno))) + processes.append(p.apply_async(func=write_proksee, args=(output, fasta, anno))) if stats: processes.append(p.apply_async(func=write_stats, args=(output, soft_core, dup_margin, compress))) if json: diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index 335f89ec..9e4cc415 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -28,7 +28,9 @@ from 
ppanggolin.annotate.annotate import init_contig_counter, read_anno_file, annotate_organism, local_identifiers_are_unique from ppanggolin.annotate import subparser as annotate_subparser from ppanggolin.pangenome import Pangenome -from ppanggolin.utils import detect_filetype, create_tmpdir, read_compressed_or_not, write_compressed_or_not, restricted_float, mk_outdir, get_config_args, parse_config_file, get_default_args, check_input_files +from ppanggolin.utils import detect_filetype, create_tmpdir, read_compressed_or_not, write_compressed_or_not, \ + restricted_float, mk_outdir, get_config_args, parse_config_file, get_default_args, \ + check_input_files, parse_input_paths_file from ppanggolin.align.alignOnPang import write_gene_to_gene_family, get_input_seq_to_family_with_rep,get_input_seq_to_family_with_all, project_and_write_partition from ppanggolin.formats.writeSequences import write_gene_sequences_from_annotations from ppanggolin.formats.readBinaries import check_pangenome_info @@ -370,48 +372,6 @@ def check_input_names(pangenome, input_names): raise NameError(f"{len(duplicated_names)} provided organism names already exist in the given pangenome: {' '.join(duplicated_names)}") -def parse_input_paths_file(path_list_file: Path) -> Dict[str, Dict[str, List[str]]]: - """ - Parse an input paths file to extract genome information. - - This function reads an input paths file, which is in TSV format, and extracts genome information - including file paths and putative circular contigs. - - :param path_list_file: The path to the input paths file. - :return: A dictionary where keys are genome names and values are dictionaries containing path information and - putative circular contigs. - :raises FileNotFoundError: If a specified genome file path does not exist. - :raises Exception: If there are no genomes in the provided file. 
- """ - logging.getLogger("PPanGGOLiN").info(f"Reading {path_list_file} to process organism files") - genome_name_to_genome_path = {} - - for line in read_compressed_or_not(path_list_file): - elements = [el.strip() for el in line.split("\t")] - genome_file_path = Path(elements[1]) - genome_name = elements[0] - putative_circular_contigs = elements[2:] - - if not genome_file_path.exists(): - # Check if the file path doesn't exist and try an alternative path. - genome_file_path_alt = path_list_file.parent.joinpath(genome_file_path) - - if not genome_file_path_alt.exists(): - raise FileNotFoundError(f"The file path '{genome_file_path}' for genome '{genome_name}' specified in '{path_list_file}' does not exist.") - else: - genome_file_path = genome_file_path_alt - - genome_name_to_genome_path[genome_name] = { - "path": genome_file_path, - "circular_contigs": putative_circular_contigs - } - - if len(genome_name_to_genome_path) == 0: - raise Exception(f"There are no genomes in the provided file: {path_list_file} ") - - return genome_name_to_genome_path - - def write_summaries(organism_2_summary: Dict[Organism, Dict[str, Any]], output_dir: Path): """ diff --git a/ppanggolin/utils.py b/ppanggolin/utils.py index 177d5162..07e6bc75 100755 --- a/ppanggolin/utils.py +++ b/ppanggolin/utils.py @@ -9,7 +9,7 @@ import argparse from io import TextIOWrapper from pathlib import Path -from typing import TextIO, Union, BinaryIO, Tuple, List, Set, Iterable +from typing import TextIO, Union, BinaryIO, Tuple, List, Set, Iterable, Dict from contextlib import contextmanager import tempfile import time @@ -992,4 +992,48 @@ def extract_contig_window(contig_size: int, positions_of_interest: Iterable[int] start_po = max(next_po - window_size, 0) - return windows_coordinates \ No newline at end of file + return windows_coordinates + + + +def parse_input_paths_file(path_list_file: Path) -> Dict[str, Dict[str, List[str]]]: + """ + Parse an input paths file to extract genome information. 
+ + This function reads an input paths file, which is in TSV format, and extracts genome information + including file paths and putative circular contigs. + + :param path_list_file: The path to the input paths file. + :return: A dictionary where keys are genome names and values are dictionaries containing path information and + putative circular contigs. + :raises FileNotFoundError: If a specified genome file path does not exist. + :raises Exception: If there are no genomes in the provided file. + """ + logging.getLogger("PPanGGOLiN").info(f"Reading {path_list_file} to process organism files") + genome_name_to_genome_path = {} + + for line in read_compressed_or_not(path_list_file): + elements = [el.strip() for el in line.split("\t")] + genome_file_path = Path(elements[1]) + genome_name = elements[0] + putative_circular_contigs = elements[2:] + + if not genome_file_path.exists(): + # Check if the file path doesn't exist and try an alternative path. + genome_file_path_alt = path_list_file.parent.joinpath(genome_file_path) + + if not genome_file_path_alt.exists(): + raise FileNotFoundError(f"The file path '{genome_file_path}' for genome '{genome_name}' specified in '{path_list_file}' does not exist.") + else: + genome_file_path = genome_file_path_alt + + genome_name_to_genome_path[genome_name] = { + "path": genome_file_path, + "circular_contigs": putative_circular_contigs + } + + if len(genome_name_to_genome_path) == 0: + raise Exception(f"There are no genomes in the provided file: {path_list_file} ") + + return genome_name_to_genome_path + From 6f952365a71c302035fa473d79c0b33e169f937a Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Thu, 19 Oct 2023 10:32:04 +0200 Subject: [PATCH 157/173] add proksee output to projection --- ppanggolin/formats/writeFlat.py | 9 ++++++--- ppanggolin/formats/write_proksee.py | 21 +++++++++++---------- ppanggolin/projection/projection.py | 18 +++++++++++++++--- 3 files changed, 32 insertions(+), 16 deletions(-) diff --git 
a/ppanggolin/formats/writeFlat.py b/ppanggolin/formats/writeFlat.py index 186f0be3..d23eef98 100644 --- a/ppanggolin/formats/writeFlat.py +++ b/ppanggolin/formats/writeFlat.py @@ -671,12 +671,16 @@ def write_proksee(output: Path, fasta: Path = None, anno: Path = None): # Generate a color mapping for modules specific to the organism org_module_to_color = {org_mod: module_to_colors[org_mod] for org_mod in org_to_modules[organism]} + output_file = output.joinpath(organism.name).with_suffix(".json") + # Write ProkSee data for the organism - write_proksee_organism(pan, organism, output, features=features, module_to_colors=org_module_to_color, + write_proksee_organism(organism, output_file, features=features, module_to_colors=org_module_to_color, rgps=pan.regions, genome_sequences=genome_sequences) + + -def manage_module_colors(modules: List[Module], window_size:int=30) -> Dict[Module, str]: +def manage_module_colors(modules: List[Module], window_size:int=50) -> Dict[Module, str]: """ Manages colors for a list of modules based on gene positions and a specified window size. 
@@ -721,7 +725,6 @@ def manage_module_colors(modules: List[Module], window_size:int=30) -> Dict[Modu nb_colors = len(set(module_to_color_int.values())) logging.getLogger().debug(f"We have found that {nb_colors} colors were necessary to color Modules.") colors = palette(nb_colors) - module_to_color = {mod: colors[col_i] for mod, col_i in module_to_color_int.items()} return module_to_color diff --git a/ppanggolin/formats/write_proksee.py b/ppanggolin/formats/write_proksee.py index d90e175d..9df61e6a 100644 --- a/ppanggolin/formats/write_proksee.py +++ b/ppanggolin/formats/write_proksee.py @@ -14,7 +14,7 @@ # local libraries from ppanggolin.genome import Organism, Gene -from ppanggolin.region import Module +from ppanggolin.region import Module, Region from ppanggolin.pangenome import Pangenome @@ -121,7 +121,7 @@ def initiate_proksee_data(features: List[str], org_name: str, module_to_color: D } cgview_data = { - "name": "PPanGGOLiN annotations at genome levels", + "name": "PPanGGOLiN annotation at genome level", "version": "1.5.0", 'settings': {}, "legend": proksee_legends, @@ -210,7 +210,7 @@ def write_genes(organism: Organism, disable_bar: bool = True) -> Tuple[List[Dict return genes_data_list, gf2gene -def write_rgp(pangenome: Pangenome, organism: Organism): +def write_rgp(rgps: Pangenome, organism: Organism): """ Writes RGP (Region of Genomic Plasticity) data for a given organism in proksee format. 
@@ -222,7 +222,7 @@ def write_rgp(pangenome: Pangenome, organism: Organism): rgp_data_list = [] # Iterate through each RGP in the pangenome - for rgp in tqdm(pangenome.regions, unit="RGP", disable=True): + for rgp in tqdm(rgps, unit="RGP", disable=True): if rgp.organism == organism: # Create an entry for the RGP in the data list rgp_data_list.append({ @@ -278,22 +278,23 @@ def write_modules(modules: List[Module], organism: Organism, gf2genes: Dict[str, return modules_data_list -def write_proksee_organism(pangenome: Pangenome, organism: Organism, output: Path, +def write_proksee_organism(organism: Organism, output_file: Path, features: List[str] = None, module_to_colors: Dict[Module, str] = None, + rgps:List[Region] = None, genome_sequences: Dict[str,str] = None): """ Write ProkSee data for a given organism. - :param pangenome: The pangenome to which the organism belongs. :param organism: The organism for which ProkSee data will be written. - :param output: The output directory where ProkSee data will be written. + :param output_file: The output file where ProkSee data will be written. :param features: A list of features to include in the ProkSee data, e.g., ["rgp", "modules", "all"]. :param module_to_colors: A dictionary mapping modules to their assigned colors. + :patram rgps: list of RGPs that belong to the organisms :param genome_sequences: The genome sequences for the organism. This function writes ProkSee data for a given organism, including contig information, genes colored by partition, RGPs, - and modules. The resulting data is saved as a JSON file in the specified output directory. + and modules. The resulting data is saved as a JSON file in the specified output file. 
""" proksee_data = initiate_proksee_data(features, organism.name, module_to_colors) @@ -304,11 +305,11 @@ def write_proksee_organism(pangenome: Pangenome, organism: Organism, output: Pat proksee_data["cgview"]["features"] = genes_features if "rgp" in features or "all" in features: - proksee_data["cgview"]["features"] += write_rgp(pangenome=pangenome, organism=organism) + proksee_data["cgview"]["features"] += write_rgp(rgps, organism=organism) if "modules" in features or "all" in features: proksee_data["cgview"]["features"] += write_modules(modules=module_to_colors, organism=organism, gf2genes=gf2genes) logging.debug(f"Write ProkSee for {organism.name}") - with open(output.joinpath(organism.name).with_suffix(".json"), "w") as out_json: + with open(output_file, "w") as out_json: json.dump(proksee_data, out_json, indent=2) diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index 9e4cc415..6bb5dfcb 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -39,7 +39,7 @@ from ppanggolin.genome import Organism from ppanggolin.geneFamily import GeneFamily from ppanggolin.region import Region, Spot, Module -from ppanggolin.formats.writeFlat import summarize_spots +from ppanggolin.formats.writeFlat import summarize_spots, write_proksee_organism, manage_module_colors class NewSpot(Spot): @@ -105,11 +105,11 @@ def launch(args: argparse.Namespace): need_rgp=predict_rgp, need_modules=project_modules, need_gene_sequences=False, need_spots=project_spots) - print("number_of_organisms", pangenome.number_of_organisms) logging.getLogger('PPanGGOLiN').info('Retrieving parameters from the provided pangenome file.') pangenome_params = argparse.Namespace( **{step: argparse.Namespace(**k_v) for step, k_v in pangenome.parameters.items()}) + # dup margin value here is specified in argument and is used to compute completeness. # Thats mean it can be different than dup margin used in spot and RGPS. 
@@ -155,6 +155,7 @@ def launch(args: argparse.Namespace): organisms_with_no_fasta = {org for org, has_fasta in org_2_has_fasta.items() if not has_fasta} if args.fasta: get_gene_sequences_from_fasta_files(organisms_with_no_fasta, genome_name_to_fasta_path) + else: raise ValueError(f"You provided GFF files for {len(organisms_with_no_fasta)} (out of {len(organisms)}) " "organisms without associated sequence data, and you did not provide " @@ -209,6 +210,9 @@ def launch(args: argparse.Namespace): exact_match=pangenome_params.spot.exact_match_size) if project_modules: + # get module color for proksee + module_to_colors = manage_module_colors(set(pangenome.modules)) + input_orgs_to_modules = project_and_write_modules(pangenome, organisms, output_dir) organism_2_summary = {} @@ -221,8 +225,16 @@ def launch(args: argparse.Namespace): input_orgs_to_modules.get(organism, None), input_org_to_lonely_genes_count[organism]) - write_summaries(organism_2_summary, output_dir) + org_module_to_color = {org_mod: module_to_colors[org_mod] for org_mod in input_orgs_to_modules.get(organism, [])} + output_file = output_dir / organism.name / f"{organism.name}_proksee.json" + + write_proksee_organism(organism, output_file, features='all', module_to_colors=org_module_to_color, + rgps=input_org_2_rgps.get(organism, None), + genome_sequences=None) + + write_summaries(organism_2_summary, output_dir) + def annotate_fasta_files(genome_name_to_fasta_path: Dict[str, dict], tmpdir: str, cpu: int = 1, translation_table: int = 11, kingdom: str = "bacteria", norna: bool = False, allow_overlap: bool = False, procedure: str = None, From ae013a51ebd4763cfaeb3e6f60e9f992c05ac6c4 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Thu, 19 Oct 2023 14:12:36 +0200 Subject: [PATCH 158/173] add sequence to gff file when equired --- ppanggolin/formats/writeFlat.py | 47 +++++++++++++++++++++------- ppanggolin/formats/writeSequences.py | 31 +++++++++++------- ppanggolin/projection/projection.py | 5 ++- 3 files 
changed, 58 insertions(+), 25 deletions(-) diff --git a/ppanggolin/formats/writeFlat.py b/ppanggolin/formats/writeFlat.py index d23eef98..f987107b 100644 --- a/ppanggolin/formats/writeFlat.py +++ b/ppanggolin/formats/writeFlat.py @@ -16,6 +16,7 @@ import os import random + import networkx as nx from plotly.express.colors import qualitative @@ -29,7 +30,7 @@ from ppanggolin.utils import write_compressed_or_not, mk_outdir, restricted_float, extract_contig_window, parse_input_paths_file from ppanggolin.formats.readBinaries import check_pangenome_info from ppanggolin.formats.write_proksee import write_proksee_organism -from ppanggolin.formats.writeSequences import read_genome_file +from ppanggolin.formats.writeSequences import read_genome_file, write_spaced_fasta # global variable to store the pangenome pan = Pangenome() # TODO change to pangenome:Pangenome = Pangenome=() ? @@ -664,7 +665,7 @@ def write_proksee(output: Path, fasta: Path = None, anno: Path = None): for organism in pan.organisms: if organisms_file: - genome_sequences = read_genome_file(org_dict, organism.name) + genome_sequences = read_genome_file(org_dict[organism.name]['path'], organism.name) else: genome_sequences = None @@ -752,19 +753,29 @@ def palette(nb_colors: int) -> List[str]: return colors -def write_gff(output: str, compress: bool = False): +def write_gff(output: str, compress: bool = False, fasta: Path = None, anno: Path = None): + """ Write the gff files for all organisms :param output: Path to output directory :param compress: Compress the file in .gz + :param fasta: The path to a FASTA file containing genome sequences (optional). + :param anno: The path to an annotation file (optional). 
+ """ logging.getLogger().info("Writing the gff files...") + + organisms_file = fasta if fasta is not None else anno + + if organisms_file: + org_dict = parse_input_paths_file(organisms_file) + outdir = output / "gff" if not os.path.exists(outdir): os.makedirs(outdir) - if pan.parameters["annotation"]["read_annotations_from_file"]: + if pan.parameters["annotate"]["# read_annotations_from_file"]: annotation_sources = {"rRNA": "external", "tRNA": "external", "CDS":"external"} @@ -778,14 +789,19 @@ def write_gff(output: str, compress: bool = False): rgp_to_spot_id = {rgp:f"spot_{spot.ID}" for spot in pan.spots for rgp in spot.regions} for org in pan.organisms: - write_gff_file(org, contig_to_rgp, rgp_to_spot_id, outdir, compress, annotation_sources) + if organisms_file: + genome_sequences = read_genome_file(org_dict[org.name]['path'], org) + else: + genome_sequences = None + + write_gff_file(org, contig_to_rgp, rgp_to_spot_id, outdir, compress, annotation_sources, genome_sequences) logging.getLogger().info("Done writing the gff files") def write_gff_file(org: Organism, contig_to_rgp: Dict[Contig, Region], rgp_to_spotid: Dict[Region, str], outdir: str, compress: bool, - annotation_sources: Dict[str, str]): + annotation_sources: Dict[str, str], genome_sequences:Dict[str,str]): """ Write the GFF file of the provided organism. @@ -795,19 +811,22 @@ def write_gff_file(org: Organism, contig_to_rgp: Dict[Contig, Region], :param outdir: Path to the output directory where the GFF file will be written. :param compress: If True, compress the output GFF file using .gz format. :param annotation_sources: A dictionary that maps types of features to their source information. + :param genome_sequences: A dictionary mapping contig names to their DNA sequences (default: None). 
""" + # sort contig by their name + sorted_contigs = sorted(org.contigs, key= lambda x : x.name) with write_compressed_or_not(outdir / F"{org.name}.gff", compress) as outfile: # write gff header outfile.write('##gff-version 3\n') - for contig in org.contigs: + for contig in sorted_contigs: if contig.length is None: raise AttributeError(f'Contig {contig.name} has no length defined.') outfile.write(f'##sequence-region {contig.name} 1 {contig.length}\n') - for contig in org.contigs: + for contig in sorted_contigs: contig_elements = sorted(contig_to_rgp[contig] + list(contig.genes) + list(contig.RNAs), key=lambda x: (x.start)) for feature in contig_elements: @@ -889,11 +908,15 @@ def write_gff_file(org: Organism, contig_to_rgp: Dict[Contig, Region], line_str = '\t'.join(map(str, line)) outfile.write(line_str + "\n") + if genome_sequences: + logging.getLogger("PPanGGOLiN").debug("Writing fasta section of gff file...") + outfile.write(f"##FASTA\n") + for contig in sorted_contigs: + outfile.write(f">{contig.name}\n") + outfile.write(write_spaced_fasta(genome_sequences[contig.name], space=60)) - - def write_parts(output: Path, soft_core: float = 0.95): """ Write the list of gene families for each partition @@ -1291,10 +1314,10 @@ def write_flat_files(pangenome: Pangenome, output: Path, cpu: int = 1, soft_core processes.append(p.apply_async(func=write_gexf, args=(output, True, compress))) if projection: processes.append(p.apply_async(func=write_projections, args=(output, compress))) - if gff: - processes.append(p.apply_async(func=write_gff, args=(output, compress))) if proksee: processes.append(p.apply_async(func=write_proksee, args=(output, fasta, anno))) + if gff: + processes.append(p.apply_async(func=write_gff, args=(output, compress, fasta, anno))) if stats: processes.append(p.apply_async(func=write_stats, args=(output, soft_core, dup_margin, compress))) if json: diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py index 
ed5e590a..3a33b4eb 100644 --- a/ppanggolin/formats/writeSequences.py +++ b/ppanggolin/formats/writeSequences.py @@ -17,7 +17,7 @@ from ppanggolin.genome import Gene from ppanggolin.utils import write_compressed_or_not, mk_outdir, read_compressed_or_not, restricted_float, detect_filetype from ppanggolin.formats.readBinaries import check_pangenome_info, get_gene_sequences_from_file - +from ppanggolin.genome import Organism module_regex = re.compile(r'^module_[0-9]+') poss_values = ['all', 'persistent', 'shell', 'cloud', 'rgp', 'softcore', 'core', module_regex] @@ -246,26 +246,33 @@ def read_fasta_gbk(file_path: Path) -> Dict[str, str]: return sequence_dict -def read_genome_file(file_dict: Dict[str, Path], genome_name: str) -> Dict[str, str]: +def read_genome_file(genome_file: Path, organism: Organism) -> Dict[str, str]: """ - Read the genome file associated to organism + Read the genome file associated to organism to extract sequences - :param file_dict: Dictionary given association between organism and fasta file - :param genome_name: organism name + :param genome_file: Path to a fasta file or gbff/gff file + :param genome: organism object :return: Dictionary with all sequences associated to contig """ - filetype = detect_filetype(file_dict[genome_name]) + filetype = detect_filetype(genome_file) if filetype in ["fasta", "gff"]: - return read_fasta_or_gff(file_dict[genome_name]) + contig_to_sequence = read_fasta_or_gff(genome_file) elif filetype == "gbff": - return read_fasta_gbk(file_dict[genome_name]) + contig_to_sequence = read_fasta_gbk(genome_file) else: - raise Exception(f"Unknown filetype detected: '{file_dict[genome_name]}'") + raise Exception(f"Unknown filetype detected: '{genome_file}'") + + # # check_contig_names + # if set(contig_to_sequence) != {contig.name for contig in organism.contigs}: + # raise Exception(f"Contig name inconsistency detected in organism '{organism.name}' between the " + # f"information stored in the pangenome file and the contigs 
found in '{genome_file}'.") + return contig_to_sequence def write_spaced_fasta(sequence: str, space: int = 60) -> str: - """Write a maximum of element per line + """ + Write a maximum of element per line :param sequence: sequence to write :param space: maximum of size for one line @@ -322,8 +329,8 @@ def write_regions_sequences(pangenome: Pangenome, output: Path, regions: str, fa loaded_genome = "" for region in tqdm(regions_to_write, unit="rgp", disable=disable_bar): if region.organism.name != loaded_genome: - loaded_genome = region.organism.name - genome_sequence = read_genome_file(org_dict, loaded_genome) + organism = region.organism + genome_sequence = read_genome_file(org_dict[organism.name], organism) fasta.write(f">{region.name}\n") fasta.write(write_spaced_fasta(genome_sequence[region.contig.name][region.starter.start:region.stopper.stop], 60)) logging.getLogger("PPanGGOLiN").info(f"Done writing the regions nucleotide sequences: '{outname}'") diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index 6bb5dfcb..2c43ec56 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -39,7 +39,7 @@ from ppanggolin.genome import Organism from ppanggolin.geneFamily import GeneFamily from ppanggolin.region import Region, Spot, Module -from ppanggolin.formats.writeFlat import summarize_spots, write_proksee_organism, manage_module_colors +from ppanggolin.formats.writeFlat import summarize_spots, write_proksee_organism, manage_module_colors, write_gff_file class NewSpot(Spot): @@ -233,6 +233,9 @@ def launch(args: argparse.Namespace): rgps=input_org_2_rgps.get(organism, None), genome_sequences=None) + # write_gff_file(org, contig_to_rgp, rgp_to_spot_id, outdir, compress, annotation_sources) + + write_summaries(organism_2_summary, output_dir) From 7a01e2743b76981daf5901003dee85ec9a2559eb Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Thu, 19 Oct 2023 14:44:17 +0200 Subject: [PATCH 159/173] add gff 
output in projection cmd --- ppanggolin/formats/writeFlat.py | 6 +++++- ppanggolin/projection/projection.py | 33 ++++++++++++++++++++++++++++- 2 files changed, 37 insertions(+), 2 deletions(-) diff --git a/ppanggolin/formats/writeFlat.py b/ppanggolin/formats/writeFlat.py index f987107b..eda491b2 100644 --- a/ppanggolin/formats/writeFlat.py +++ b/ppanggolin/formats/writeFlat.py @@ -1405,13 +1405,17 @@ def parser_flat(parser: argparse.ArgumentParser): "on the organism") optional.add_argument("--gff", required=False, action="store_true", help="Generate a gff file for each organism containing pangenome annotations.") + optional.add_argument("--proksee", required=False, action="store_true", - help="Generate a json file for each organism containing pangenome annotations to be used to in proksee.") + help="Generate JSON map files for PROKSEE for each organism containing pangenome annotations to be used to in proksee.") + optional.add_argument("--stats", required=False, action="store_true", help="tsv files with some statistics for each organism and for each gene family") + optional.add_argument("--partitions", required=False, action="store_true", help="list of families belonging to each partition, with one file per partitions and " "one family per line") + optional.add_argument("--compress", required=False, action="store_true", help="Compress the files in .gz") optional.add_argument("--json", required=False, action="store_true", help="Writes the graph in a json file format") optional.add_argument("--regions", required=False, action="store_true", diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index 2c43ec56..dc2a684f 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -218,6 +218,9 @@ def launch(args: argparse.Namespace): organism_2_summary = {} for organism in organisms: + + org_outdir = output_dir / organism.name + # summarize projection for all input organisms organism_2_summary[organism] = 
summarize_projection(organism, pangenome, single_copy_fams, input_org_2_rgps.get(organism, None), @@ -233,7 +236,26 @@ def launch(args: argparse.Namespace): rgps=input_org_2_rgps.get(organism, None), genome_sequences=None) - # write_gff_file(org, contig_to_rgp, rgp_to_spot_id, outdir, compress, annotation_sources) + + if genome_name_to_annot_path: # if the genome has not been annotated by PPanGGOLiN + annotation_sources = {"rRNA": "external", + "tRNA": "external", + "CDS":"external"} + else: + annotation_sources = {} + + contig_to_rgp, rgp_to_spot_id = {}, {} + + if organism in input_org_2_rgps: + contig_to_rgp = defaultdict(list) + for rgp in input_org_2_rgps[organism]: + contig_to_rgp[rgp.contig].append(rgp) + + if organism in input_org_to_spots: + rgp_to_spot_id = {rgp:f"spot_{spot.ID}" for spot in input_org_to_spots[organism] for rgp in spot.regions if rgp in input_org_2_rgps[organism] } + + + write_gff_file(organism, contig_to_rgp, rgp_to_spot_id, outdir=org_outdir, compress=False, annotation_sources=annotation_sources, genome_sequences=None) write_summaries(organism_2_summary, output_dir) @@ -1224,6 +1246,15 @@ def parser_projection(parser: argparse.ArgumentParser): optional.add_argument('--graph_formats', required=False, type=str, choices=['gexf', "graphml"], nargs="+", default=['gexf'], help="Format of the output graph.") + optional.add_argument("--gff", required=False, action="store_true", + help="Generate GFF files with projected pangenome annotations for each input organism.") + + optional.add_argument("--proksee", required=False, action="store_true", + help="Generate JSON map files for PROKSEE with projected pangenome annotations for each input organism.") + + optional.add_argument("--add_sequences", required=False, action="store_true", + help="Include input genome DNA sequences in GFF and Proksee output.") + optional.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus") From 
fb4fc1309bbf4e9460651b3717a05aefce68c870 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Thu, 19 Oct 2023 16:10:03 +0200 Subject: [PATCH 160/173] add argument to output or not gff and proksee --- ppanggolin/projection/projection.py | 45 ++++++++++++++++------------- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index dc2a684f..5d80ad09 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -228,34 +228,39 @@ def launch(args: argparse.Namespace): input_orgs_to_modules.get(organism, None), input_org_to_lonely_genes_count[organism]) - org_module_to_color = {org_mod: module_to_colors[org_mod] for org_mod in input_orgs_to_modules.get(organism, [])} + if args.proksee: + org_module_to_color = {org_mod: module_to_colors[org_mod] for org_mod in input_orgs_to_modules.get(organism, [])} - output_file = output_dir / organism.name / f"{organism.name}_proksee.json" + output_file = output_dir / organism.name / f"{organism.name}_proksee.json" - write_proksee_organism(organism, output_file, features='all', module_to_colors=org_module_to_color, - rgps=input_org_2_rgps.get(organism, None), - genome_sequences=None) + + write_proksee_organism(organism, output_file, features='all', module_to_colors=org_module_to_color, + rgps=input_org_2_rgps.get(organism, None), + genome_sequences=None) - if genome_name_to_annot_path: # if the genome has not been annotated by PPanGGOLiN - annotation_sources = {"rRNA": "external", - "tRNA": "external", - "CDS":"external"} - else: - annotation_sources = {} + if args.gff: + if genome_name_to_annot_path: # if the genome has not been annotated by PPanGGOLiN + annotation_sources = {"rRNA": "external", + "tRNA": "external", + "CDS":"external"} + else: + annotation_sources = {} - contig_to_rgp, rgp_to_spot_id = {}, {} + contig_to_rgp, rgp_to_spot_id = {}, {} - if organism in input_org_2_rgps: - contig_to_rgp = defaultdict(list) - 
for rgp in input_org_2_rgps[organism]: - contig_to_rgp[rgp.contig].append(rgp) + if organism in input_org_2_rgps: + contig_to_rgp = defaultdict(list) + for rgp in input_org_2_rgps[organism]: + contig_to_rgp[rgp.contig].append(rgp) - if organism in input_org_to_spots: - rgp_to_spot_id = {rgp:f"spot_{spot.ID}" for spot in input_org_to_spots[organism] for rgp in spot.regions if rgp in input_org_2_rgps[organism] } + if organism in input_org_to_spots: + rgp_to_spot_id = {rgp:f"spot_{spot.ID}" for spot in input_org_to_spots[organism] for rgp in spot.regions if rgp in input_org_2_rgps[organism] } + + + write_gff_file(organism, contig_to_rgp, rgp_to_spot_id, outdir=org_outdir, compress=False, annotation_sources=annotation_sources, genome_sequences=None) - write_gff_file(organism, contig_to_rgp, rgp_to_spot_id, outdir=org_outdir, compress=False, annotation_sources=annotation_sources, genome_sequences=None) write_summaries(organism_2_summary, output_dir) @@ -270,7 +275,7 @@ def annotate_fasta_files(genome_name_to_fasta_path: Dict[str, dict], tmpdir: str :param genome_name_to_fasta_path: :param fasta_list: List of fasta file containing sequences that will be base of pangenome :param tmpdir: Path to temporary directory - :param cpu: number of CPU cores to use + :param cpu: number of CPU cores to use :param translation_table: Translation table (genetic code) to use. :param kingdom: Kingdom to which the prokaryota belongs to, to know which models to use for rRNA annotation. :param norna: Use to avoid annotating RNA features. 
From 8a138569c25c639c3cbc62d576895c19c085da54 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Thu, 19 Oct 2023 17:47:56 +0200 Subject: [PATCH 161/173] add proksee own output dir in write cmd --- ppanggolin/formats/writeFlat.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/ppanggolin/formats/writeFlat.py b/ppanggolin/formats/writeFlat.py index eda491b2..110dbfb1 100644 --- a/ppanggolin/formats/writeFlat.py +++ b/ppanggolin/formats/writeFlat.py @@ -646,6 +646,11 @@ def write_proksee(output: Path, fasta: Path = None, anno: Path = None): If genome sequences are provided in a FASTA file or annotations in a separate file, they will be used in the generation of ProkSee data for each organism to add sequences data to proksee files. """ + + proksee_outdir = output / "proksee" + if not os.path.exists(proksee_outdir): + os.makedirs(proksee_outdir) + organisms_file = fasta if fasta is not None else anno if organisms_file: @@ -672,7 +677,7 @@ def write_proksee(output: Path, fasta: Path = None, anno: Path = None): # Generate a color mapping for modules specific to the organism org_module_to_color = {org_mod: module_to_colors[org_mod] for org_mod in org_to_modules[organism]} - output_file = output.joinpath(organism.name).with_suffix(".json") + output_file = proksee_outdir.joinpath(organism.name).with_suffix(".json") # Write ProkSee data for the organism write_proksee_organism(organism, output_file, features=features, module_to_colors=org_module_to_color, rgps=pan.regions, From 334ad700a5f7afb45b7610ad970ccb9c7a9324c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Fri, 20 Oct 2023 12:04:48 +0200 Subject: [PATCH 162/173] Add limit in networkx version for compatibility problem with python3.8 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 45c799ce..6396883f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,7 @@ pyrodigal>=3.0.1 
aragorn>=1.2.41 infernal>=1.1.4 mmseqs2>=13.45111 -networkx>=2.7 +networkx>=2.7,<=3.1 dataclasses>=0.8 scipy>=1.7.3 plotly>=4.14.3 From 0c590c634b44406d26c936e1804079af5b6b5e44 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Fri, 20 Oct 2023 13:37:58 +0200 Subject: [PATCH 163/173] fix error in parsing fasta file --- ppanggolin/formats/writeSequences.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py index 3a33b4eb..30797321 100644 --- a/ppanggolin/formats/writeSequences.py +++ b/ppanggolin/formats/writeSequences.py @@ -191,15 +191,16 @@ def read_fasta_or_gff(file_path: Path) -> Dict[str, str]: sequence_dict = {} seqname = "" seq = "" - z = False + in_fasta_part = False with read_compressed_or_not(file_path) as f: for line in f: if line.startswith(">"): - z = True - if z: + in_fasta_part = True + if in_fasta_part: if line.startswith('>'): if seq != "": sequence_dict[seqname] = seq + seq = "" seqname = line[1:].strip().split()[0] else: seq += line.strip() @@ -263,10 +264,10 @@ def read_genome_file(genome_file: Path, organism: Organism) -> Dict[str, str]: else: raise Exception(f"Unknown filetype detected: '{genome_file}'") - # # check_contig_names - # if set(contig_to_sequence) != {contig.name for contig in organism.contigs}: - # raise Exception(f"Contig name inconsistency detected in organism '{organism.name}' between the " - # f"information stored in the pangenome file and the contigs found in '{genome_file}'.") + # check_contig_names + if set(contig_to_sequence) != {contig.name for contig in organism.contigs}: + raise Exception(f"Contig name inconsistency detected in organism '{organism.name}' between the " + f"information stored in the pangenome file and the contigs found in '{genome_file}'.") return contig_to_sequence From a0f6132439517605f660b5278d6f857ec9d39f61 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Fri, 20 Oct 2023 13:38:45 +0200 Subject: 
[PATCH 164/173] add seq to proksee and gff output for proj cmd --- ppanggolin/projection/projection.py | 48 ++++++++++++++++++----------- 1 file changed, 30 insertions(+), 18 deletions(-) diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index 5d80ad09..8e55db9c 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -40,7 +40,7 @@ from ppanggolin.geneFamily import GeneFamily from ppanggolin.region import Region, Spot, Module from ppanggolin.formats.writeFlat import summarize_spots, write_proksee_organism, manage_module_colors, write_gff_file - +from ppanggolin.formats.writeSequences import read_genome_file class NewSpot(Spot): """ @@ -125,36 +125,41 @@ def launch(args: argparse.Namespace): single_copy_fams.add(fam) - genome_name_to_fasta_path, genome_name_to_annot_path = None, None + # genome_name_to_fasta_path, genome_name_to_annot_path = None, None + genome_name_to_path = None if args.input_mode == "multiple": if args.anno: - genome_name_to_annot_path = parse_input_paths_file(args.anno) + input_type = "annotation" + genome_name_to_path = parse_input_paths_file(args.anno) - if args.fasta: - genome_name_to_fasta_path = parse_input_paths_file(args.fasta) + elif args.fasta: + input_type = "fasta" + genome_name_to_path = parse_input_paths_file(args.fasta) else: # args.input_mode == "single: circular_contigs = args.circular_contigs if args.circular_contigs else [] if args.anno: - genome_name_to_annot_path = {args.organism_name: {"path": args.annot, + input_type = "annotation" + genome_name_to_path = {args.organism_name: {"path": args.annot, "circular_contigs": circular_contigs}} - if args.fasta: - genome_name_to_fasta_path = {args.organism_name: {"path": args.fasta, + elif args.fasta: + input_type = "fasta" + genome_name_to_path = {args.organism_name: {"path": args.fasta, "circular_contigs": circular_contigs}} - if genome_name_to_annot_path: - check_input_names(pangenome, 
genome_name_to_annot_path) + if input_type == "annotation": + check_input_names(pangenome, genome_name_to_path) - organisms, org_2_has_fasta = read_annotation_files(genome_name_to_annot_path, cpu=args.cpu, pseudo=args.use_pseudo, + organisms, org_2_has_fasta = read_annotation_files(genome_name_to_path, cpu=args.cpu, pseudo=args.use_pseudo, disable_bar=args.disable_prog_bar) if not all((has_fasta for has_fasta in org_2_has_fasta.values())): organisms_with_no_fasta = {org for org, has_fasta in org_2_has_fasta.items() if not has_fasta} if args.fasta: - get_gene_sequences_from_fasta_files(organisms_with_no_fasta, genome_name_to_fasta_path) + get_gene_sequences_from_fasta_files(organisms_with_no_fasta, genome_name_to_path) else: raise ValueError(f"You provided GFF files for {len(organisms_with_no_fasta)} (out of {len(organisms)}) " @@ -162,15 +167,15 @@ def launch(args: argparse.Namespace): "FASTA sequences using the --fasta or --single_fasta_file options. Therefore, it is impossible to project the pangenome onto the input genomes. 
" f"The following organisms have no associated sequence data: {', '.join(o.name for o in organisms_with_no_fasta)}") - elif genome_name_to_fasta_path: + elif input_type == "fasta": annotate_param_names = ["norna", "kingdom", "allow_overlap", "prodigal_procedure"] annotate_params = manage_annotate_param(annotate_param_names, pangenome_params.annotate, args.config) - check_input_names(pangenome, genome_name_to_fasta_path) - organisms = annotate_fasta_files(genome_name_to_fasta_path=genome_name_to_fasta_path, tmpdir=args.tmpdir, cpu=args.cpu, + check_input_names(pangenome, genome_name_to_path) + organisms = annotate_fasta_files(genome_name_to_fasta_path=genome_name_to_path, tmpdir=args.tmpdir, cpu=args.cpu, translation_table=int(pangenome_params.cluster.translation_table), norna=annotate_params.norna, kingdom=annotate_params.kingdom, allow_overlap=annotate_params.allow_overlap, procedure=annotate_params.prodigal_procedure, disable_bar=args.disable_prog_bar ) @@ -209,6 +214,8 @@ def launch(args: argparse.Namespace): set_size=pangenome_params.spot.set_size, exact_match=pangenome_params.spot.exact_match_size) + + if project_modules: # get module color for proksee module_to_colors = manage_module_colors(set(pangenome.modules)) @@ -228,6 +235,10 @@ def launch(args: argparse.Namespace): input_orgs_to_modules.get(organism, None), input_org_to_lonely_genes_count[organism]) + if (args.proksee or args.gff) and args.add_sequences: + genome_sequences = read_genome_file(genome_name_to_path[organism.name]['path'], organism) + genome_name_to_path[organism.name]['path'] + if args.proksee: org_module_to_color = {org_mod: module_to_colors[org_mod] for org_mod in input_orgs_to_modules.get(organism, [])} @@ -236,11 +247,11 @@ def launch(args: argparse.Namespace): write_proksee_organism(organism, output_file, features='all', module_to_colors=org_module_to_color, rgps=input_org_2_rgps.get(organism, None), - genome_sequences=None) + genome_sequences=genome_sequences) if args.gff: - if 
genome_name_to_annot_path: # if the genome has not been annotated by PPanGGOLiN + if input_type == "annotation": # if the genome has not been annotated by PPanGGOLiN annotation_sources = {"rRNA": "external", "tRNA": "external", "CDS":"external"} @@ -258,7 +269,8 @@ def launch(args: argparse.Namespace): rgp_to_spot_id = {rgp:f"spot_{spot.ID}" for spot in input_org_to_spots[organism] for rgp in spot.regions if rgp in input_org_2_rgps[organism] } - write_gff_file(organism, contig_to_rgp, rgp_to_spot_id, outdir=org_outdir, compress=False, annotation_sources=annotation_sources, genome_sequences=None) + write_gff_file(organism, contig_to_rgp, rgp_to_spot_id, outdir=org_outdir, compress=False, + annotation_sources=annotation_sources, genome_sequences=genome_sequences) From 8e447c8a8bd2201d20f4d3a08d6180fc800be63b Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Fri, 20 Oct 2023 13:39:08 +0200 Subject: [PATCH 165/173] fix bug in proksee output --- ppanggolin/formats/writeFlat.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ppanggolin/formats/writeFlat.py b/ppanggolin/formats/writeFlat.py index 110dbfb1..681a7761 100644 --- a/ppanggolin/formats/writeFlat.py +++ b/ppanggolin/formats/writeFlat.py @@ -646,7 +646,7 @@ def write_proksee(output: Path, fasta: Path = None, anno: Path = None): If genome sequences are provided in a FASTA file or annotations in a separate file, they will be used in the generation of ProkSee data for each organism to add sequences data to proksee files. 
""" - + proksee_outdir = output / "proksee" if not os.path.exists(proksee_outdir): os.makedirs(proksee_outdir) @@ -670,7 +670,7 @@ def write_proksee(output: Path, fasta: Path = None, anno: Path = None): for organism in pan.organisms: if organisms_file: - genome_sequences = read_genome_file(org_dict[organism.name]['path'], organism.name) + genome_sequences = read_genome_file(org_dict[organism.name]['path'], organism) else: genome_sequences = None @@ -684,7 +684,7 @@ def write_proksee(output: Path, fasta: Path = None, anno: Path = None): genome_sequences=genome_sequences) - + logging.getLogger().info("Done writing the proksee files") def manage_module_colors(modules: List[Module], window_size:int=50) -> Dict[Module, str]: """ From a1fb03e8f76f8d30f1487be00e1b151fa757409a Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Fri, 20 Oct 2023 14:10:06 +0200 Subject: [PATCH 166/173] add gff and proksee in the gh action --- .github/workflows/main.yml | 8 ++++---- ppanggolin/projection/projection.py | 2 ++ 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index e0f59001..968e9c45 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -67,8 +67,8 @@ jobs: ppanggolin spot -p stepbystep/pangenome.h5 --spot_graph --overlapping_match 2 --set_size 3 --exact_match_size 1 ppanggolin draw -p stepbystep/pangenome.h5 --draw_spots -o stepbystep -f ppanggolin module -p stepbystep/pangenome.h5 --transitive 4 --size 3 --jaccard 0.86 --dup_margin 0.05 - ppanggolin write -p stepbystep/pangenome.h5 --output stepbystep -f --soft_core 0.9 --dup_margin 0.06 --gexf --light_gexf --csv --Rtab --projection --stats --partitions --compress --json --regions --spots --borders --families_tsv --cpu 1 - ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families all --gene_families shell --regions all --fasta organisms.fasta.list + ppanggolin write -p stepbystep/pangenome.h5 --output stepbystep -f 
--soft_core 0.9 --dup_margin 0.06 --gff --proksee --gexf --light_gexf --csv --Rtab --projection --stats --partitions --compress --json --regions --spots --borders --families_tsv --cpu 1 --fasta organisms.fasta.list --gff --proksee + ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families all --gene_families shell --regions all --fasta organisms.fasta.list --gff --proksee ppanggolin draw -p stepbystep/pangenome.h5 --draw_spots --spots all -o stepbystep -f ppanggolin metrics -p stepbystep/pangenome.h5 --genome_fluidity --info_modules --no_print_info -f --log metrics.log cd - @@ -134,10 +134,10 @@ jobs: run: | cd testingDataset head organisms.gbff.list | sed 's/^/input_org_/g' > organisms.gbff.head.list - ppanggolin projection --pangenome stepbystep/pangenome.h5 -o projection_from_lisy_of_gbff --anno organisms.gbff.head.list + ppanggolin projection --pangenome stepbystep/pangenome.h5 -o projection_from_list_of_gbff --anno organisms.gbff.head.list --gff --proksee ppanggolin projection --pangenome mybasicpangenome/pangenome.h5 -o projection_from_single_fasta \ --organism_name chlam_A --fasta FASTA/GCF_002776845.1_ASM277684v1_genomic.fna.gz \ - --spot_graph --graph_formats graphml --fast --keep_tmp -f + --spot_graph --graph_formats graphml --fast --keep_tmp -f --add_sequences --gff --proksee diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index 8e55db9c..c3ff7133 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -238,6 +238,8 @@ def launch(args: argparse.Namespace): if (args.proksee or args.gff) and args.add_sequences: genome_sequences = read_genome_file(genome_name_to_path[organism.name]['path'], organism) genome_name_to_path[organism.name]['path'] + else: + genome_sequences = None if args.proksee: org_module_to_color = {org_mod: module_to_colors[org_mod] for org_mod in input_orgs_to_modules.get(organism, [])} From 068bbba2d4d3e3ba46b156a14315343b447a6f47 Mon 
Sep 17 00:00:00 2001 From: JeanMainguy Date: Fri, 20 Oct 2023 14:45:47 +0200 Subject: [PATCH 167/173] fix error in gh action --- .github/workflows/main.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 968e9c45..9297851c 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -67,8 +67,8 @@ jobs: ppanggolin spot -p stepbystep/pangenome.h5 --spot_graph --overlapping_match 2 --set_size 3 --exact_match_size 1 ppanggolin draw -p stepbystep/pangenome.h5 --draw_spots -o stepbystep -f ppanggolin module -p stepbystep/pangenome.h5 --transitive 4 --size 3 --jaccard 0.86 --dup_margin 0.05 - ppanggolin write -p stepbystep/pangenome.h5 --output stepbystep -f --soft_core 0.9 --dup_margin 0.06 --gff --proksee --gexf --light_gexf --csv --Rtab --projection --stats --partitions --compress --json --regions --spots --borders --families_tsv --cpu 1 --fasta organisms.fasta.list --gff --proksee - ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families all --gene_families shell --regions all --fasta organisms.fasta.list --gff --proksee + ppanggolin write -p stepbystep/pangenome.h5 --output stepbystep -f --soft_core 0.9 --dup_margin 0.06 --gexf --light_gexf --csv --Rtab --projection --stats --partitions --compress --json --regions --spots --borders --families_tsv --cpu 1 --fasta organisms.fasta.list --gff --proksee + ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families all --gene_families shell --regions all --fasta organisms.fasta.list ppanggolin draw -p stepbystep/pangenome.h5 --draw_spots --spots all -o stepbystep -f ppanggolin metrics -p stepbystep/pangenome.h5 --genome_fluidity --info_modules --no_print_info -f --log metrics.log cd - From 4defeca232712526a7f6f19f803fae24523602d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Mon, 23 Oct 2023 11:04:46 +0200 Subject: [PATCH 168/173] resolve some 
review comment --- ppanggolin/annotate/annotate.py | 7 ++++--- ppanggolin/formats/writeSequences.py | 3 +-- ppanggolin/genome.py | 20 ++++++-------------- ppanggolin/projection/projection.py | 2 +- ppanggolin/region.py | 4 ++-- 5 files changed, 14 insertions(+), 22 deletions(-) diff --git a/ppanggolin/annotate/annotate.py b/ppanggolin/annotate/annotate.py index 871c9e76..9f25fe4e 100644 --- a/ppanggolin/annotate/annotate.py +++ b/ppanggolin/annotate/annotate.py @@ -239,7 +239,8 @@ def read_org_gbff(organism_name: str, gbff_file_path: Path, circular_contigs: Li sequence += line[10:].replace(" ", "").strip().upper() line = lines.pop() - contig.add_contig_length(len(sequence)) + if contig.length != len(sequence): + raise ValueError("The contig lenght defined is different than the sequence length") # get each gene's sequence. for gene in contig.genes: @@ -371,8 +372,8 @@ def get_id_attribute(attributes_dict: dict) -> str: if has_fasta and fasta_string != "": contig_sequences = read_fasta(org, fasta_string.split('\n')) # _ is total contig length for contig in org.contigs: - - contig.add_contig_length(len(contig_sequences[contig.name])) + if contig.length != len(contig_sequences[contig.name]): + raise ValueError("The contig lenght defined is different than the sequence length") for gene in contig.genes: gene.add_sequence(get_dna_sequence(contig_sequences[contig.name], gene)) diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py index 30797321..acbface1 100644 --- a/ppanggolin/formats/writeSequences.py +++ b/ppanggolin/formats/writeSequences.py @@ -14,10 +14,9 @@ # local libraries from ppanggolin.pangenome import Pangenome from ppanggolin.geneFamily import GeneFamily -from ppanggolin.genome import Gene +from ppanggolin.genome import Gene, Organism from ppanggolin.utils import write_compressed_or_not, mk_outdir, read_compressed_or_not, restricted_float, detect_filetype from ppanggolin.formats.readBinaries import check_pangenome_info, 
get_gene_sequences_from_file -from ppanggolin.genome import Organism module_regex = re.compile(r'^module_[0-9]+') poss_values = ['all', 'persistent', 'shell', 'cloud', 'rgp', 'softcore', 'core', module_regex] diff --git a/ppanggolin/genome.py b/ppanggolin/genome.py index fd5d1243..743a4eb4 100644 --- a/ppanggolin/genome.py +++ b/ppanggolin/genome.py @@ -359,7 +359,12 @@ def length(self, contig_len: int): raise TypeError("Contig length is expected to be an integer") if contig_len < 0: raise ValueError("Contig length must be positive") - self._length = contig_len + + if self.length is None: + self._length = contig_len + elif self.length != contig_length: + raise ValueError('Attempting to define a contig length different from the previously defined value.') + def __len__(self): return self.length @@ -525,19 +530,6 @@ def number_of_rnas(self) -> int: """ return len(self._rna_getter) - def add_contig_length(self, contig_length: int): - """ - Add contig length to Contig object. - - :param contig_length: Length of the contig. - :raises ValueError: If trying to define a contig length different than previously defined. 
- """ - if self.length is None: - self.length = contig_length - - elif self.length != contig_length: - raise ValueError('Attempting to define a contig length different from the previously defined value.') - class Organism(MetaFeatures): """ diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index c3ff7133..3bd6a42a 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -1049,7 +1049,7 @@ def predict_spot_in_one_organism( input_org_spots = {spot for spots in input_rgp_to_spots.values() for spot in spots } - new_spots = {spot for spot in input_org_spots if type(spot) == NewSpot} + new_spots = {spot for spot in input_org_spots if isinstance(spot, NewSpot)} logging.getLogger('PPanGGOLiN').debug( diff --git a/ppanggolin/region.py b/ppanggolin/region.py index 58700a1e..da434858 100644 --- a/ppanggolin/region.py +++ b/ppanggolin/region.py @@ -245,7 +245,7 @@ def contig(self) -> Contig: return self.starter.contig @property - def start(self) -> Contig: + def start(self) -> int: """ Get the starter start link to RGP @@ -254,7 +254,7 @@ def start(self) -> Contig: return self.starter.start @property - def stop(self) -> Contig: + def stop(self) -> int: """ Get the stopper stop link to RGP From 95171ee686e2c7dad2c84c8cab8e6b2ed4c47bba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Mon, 23 Oct 2023 11:45:09 +0200 Subject: [PATCH 169/173] Fix variable name --- ppanggolin/genome.py | 2 +- ppanggolin/region.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ppanggolin/genome.py b/ppanggolin/genome.py index 743a4eb4..f1ebb682 100644 --- a/ppanggolin/genome.py +++ b/ppanggolin/genome.py @@ -362,7 +362,7 @@ def length(self, contig_len: int): if self.length is None: self._length = contig_len - elif self.length != contig_length: + elif self.length != contig_len: raise ValueError('Attempting to define a contig length different from the previously defined value.') 
diff --git a/ppanggolin/region.py b/ppanggolin/region.py index da434858..a2abcc3c 100644 --- a/ppanggolin/region.py +++ b/ppanggolin/region.py @@ -746,7 +746,7 @@ def organisms(self) -> Generator[Organism, None, None]: organisms = set() for fam in self.families: organisms |= set(fam.organisms) - return organisms + yield from organisms def mk_bitarray(self, index: Dict[Organism, int], partition: str = 'all'): From 5eb755a55c1ae5f0fe65d607bec945dbe7dc2cad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Mon, 23 Oct 2023 17:14:03 +0200 Subject: [PATCH 170/173] Fix bug in get contig length in gff files --- ppanggolin/annotate/annotate.py | 3 +-- ppanggolin/annotate/synta.py | 1 - ppanggolin/genome.py | 3 ++- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/ppanggolin/annotate/annotate.py b/ppanggolin/annotate/annotate.py index 9f25fe4e..3a2b7680 100644 --- a/ppanggolin/annotate/annotate.py +++ b/ppanggolin/annotate/annotate.py @@ -241,7 +241,6 @@ def read_org_gbff(organism_name: str, gbff_file_path: Path, circular_contigs: Li if contig.length != len(sequence): raise ValueError("The contig lenght defined is different than the sequence length") - # get each gene's sequence. 
for gene in contig.genes: gene.add_sequence(get_dna_sequence(sequence, gene)) @@ -319,7 +318,7 @@ def get_id_attribute(attributes_dict: dict) -> str: True if fields[1] in circular_contigs else False) contig_counter.value += 1 org.add(contig) - contig.length = int(fields[-1]) - int(fields[3]) + 1 + contig.length = int(fields[-1]) - int(fields[2]) + 1 continue elif line.startswith('#'): # comment lines to be ignores by parsers diff --git a/ppanggolin/annotate/synta.py b/ppanggolin/annotate/synta.py index 8dbe7f2d..18e512ab 100644 --- a/ppanggolin/annotate/synta.py +++ b/ppanggolin/annotate/synta.py @@ -169,7 +169,6 @@ def read_fasta(org: Organism, fna_file: Union[TextIOWrapper, list]) -> Dict[str, :return: Dictionnary with contig_name as keys and contig sequence in values """ global contig_counter - try: contigs = {} contig_seq = "" diff --git a/ppanggolin/genome.py b/ppanggolin/genome.py index f1ebb682..d74e8779 100644 --- a/ppanggolin/genome.py +++ b/ppanggolin/genome.py @@ -360,9 +360,10 @@ def length(self, contig_len: int): if contig_len < 0: raise ValueError("Contig length must be positive") - if self.length is None: + if self._length is None: self._length = contig_len elif self.length != contig_len: + logging.getLogger("PPanGGOLiN").debug(f"Known contig length = {self.length}, new length = {contig_len}") raise ValueError('Attempting to define a contig length different from the previously defined value.') From accd2c19c06835d6e1b5ace0369fb9c67111366a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Mon, 23 Oct 2023 17:45:24 +0200 Subject: [PATCH 171/173] resolve PR review --- ppanggolin/formats/writeFlat.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/ppanggolin/formats/writeFlat.py b/ppanggolin/formats/writeFlat.py index 681a7761..97421c5d 100644 --- a/ppanggolin/formats/writeFlat.py +++ b/ppanggolin/formats/writeFlat.py @@ -647,9 +647,8 @@ def write_proksee(output: Path, fasta: Path = None, anno: Path 
= None): of ProkSee data for each organism to add sequences data to proksee files. """ - proksee_outdir = output / "proksee" - if not os.path.exists(proksee_outdir): - os.makedirs(proksee_outdir) + proksee_outdir = output / "proksee" + mk_outdir(proksee_outdir, True) organisms_file = fasta if fasta is not None else anno @@ -664,7 +663,7 @@ def write_proksee(output: Path, fasta: Path = None, anno: Path = None): org_to_modules[org].add(mod) # Generate a color mapping for modules - module_to_colors = manage_module_colors(set(pan.modules)) + module_to_colors = manage_module_colors(list(pan.modules)) features = ["all"] @@ -677,7 +676,7 @@ def write_proksee(output: Path, fasta: Path = None, anno: Path = None): # Generate a color mapping for modules specific to the organism org_module_to_color = {org_mod: module_to_colors[org_mod] for org_mod in org_to_modules[organism]} - output_file = proksee_outdir.joinpath(organism.name).with_suffix(".json") + output_file = proksee_outdir / f"{organism.name}.json" # Write ProkSee data for the organism write_proksee_organism(organism, output_file, features=features, module_to_colors=org_module_to_color, rgps=pan.regions, @@ -777,8 +776,7 @@ def write_gff(output: str, compress: bool = False, fasta: Path = None, anno: Pat org_dict = parse_input_paths_file(organisms_file) outdir = output / "gff" - if not os.path.exists(outdir): - os.makedirs(outdir) + mk_outdir(outdir, True) if pan.parameters["annotate"]["# read_annotations_from_file"]: annotation_sources = {"rRNA": "external", From 61dacfeb42dc6a2df718b8c09e113fd73b8a5c60 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Mon, 23 Oct 2023 16:07:06 +0200 Subject: [PATCH 172/173] refac launch of projection --- ppanggolin/projection/projection.py | 300 ++++++++++++++++++---------- 1 file changed, 200 insertions(+), 100 deletions(-) diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index 3bd6a42a..0ad4cd2a 100644 --- a/ppanggolin/projection/projection.py 
+++ b/ppanggolin/projection/projection.py @@ -51,23 +51,29 @@ class NewSpot(Spot): def __str__(self): return f'new_spot_{str(self.ID)}' -def launch(args: argparse.Namespace): +def check_pangenome_for_projection(pangenome: Pangenome, fast_aln:bool): """ - Command launcher + Check the status of a pangenome and determine whether projection is possible. - :param args: All arguments provide by user - """ + :param pangenome: The pangenome to be checked. + :param fast_aln: Whether to use the fast alignment option for gene projection. - output_dir = Path(args.output) - mk_outdir(output_dir, args.force) + This function checks various attributes of a pangenome to determine whether it is suitable for projecting + features into a provided genome. + + Returns: + A tuple indicating whether RGP prediction, spot projection, and module projection + are possible (True) or not (False) based on the pangenome's status. + + Raises: + NameError: If the pangenome has not been partitioned. + Exception: If the pangenome lacks gene sequences or gene family sequences, and fast alignment is not enabled. + """ - # For the moment these elements of the pangenome are predicted by default project_modules = True predict_rgp = True project_spots = True - pangenome = Pangenome() - pangenome.add_file(args.pangenome) if pangenome.status["partitioned"] not in ["Computed", "Loaded", "inFile"]: raise NameError(f"The provided pangenome has not been partitioned. " @@ -91,7 +97,7 @@ def launch(args: argparse.Namespace): project_modules = False - if pangenome.status["geneSequences"] not in ["Loaded", "Computed", "inFile"] and not args.fast: + if pangenome.status["geneSequences"] not in ["Loaded", "Computed", "inFile"] and not fast_aln: raise Exception("The provided pangenome has no gene sequences. 
" "Projection is still possible with the --fast option to use representative " "sequences rather than all genes to annotate input genes.") @@ -100,70 +106,52 @@ def launch(args: argparse.Namespace): raise Exception("The provided pangenome has no gene families sequences. " "This is not possible to annotate an input organism to this pangenome.") + return predict_rgp, project_spots, project_modules - check_pangenome_info(pangenome, need_annotations=True, need_families=True, disable_bar=args.disable_prog_bar, - need_rgp=predict_rgp, need_modules=project_modules, need_gene_sequences=False, - need_spots=project_spots) - - logging.getLogger('PPanGGOLiN').info('Retrieving parameters from the provided pangenome file.') - pangenome_params = argparse.Namespace( - **{step: argparse.Namespace(**k_v) for step, k_v in pangenome.parameters.items()}) - - # dup margin value here is specified in argument and is used to compute completeness. - # Thats mean it can be different than dup margin used in spot and RGPS. 
- - # TODO make this single_copy_fams a method of class Pangenome that should be used in write --stats - single_copy_fams = set() - - for fam in pangenome.gene_families: - if fam.named_partition == "persistent": - dup = len([genes for genes in fam.get_org_dict().values() if - len([gene for gene in genes if not gene.is_fragment]) > 1]) - - if (dup / fam.number_of_organisms) < args.dup_margin: - single_copy_fams.add(fam) - - - # genome_name_to_fasta_path, genome_name_to_annot_path = None, None +def manage_input_genomes_annotation(pangenome, input_mode, anno, fasta, + organism_name, circular_contigs, pangenome_params, + cpu, use_pseudo, disable_bar, tmpdir, config): + """ + """ genome_name_to_path = None - if args.input_mode == "multiple": - if args.anno: + if input_mode == "multiple": + if anno: input_type = "annotation" - genome_name_to_path = parse_input_paths_file(args.anno) + genome_name_to_path = parse_input_paths_file(anno) - elif args.fasta: + elif fasta: input_type = "fasta" - genome_name_to_path = parse_input_paths_file(args.fasta) + genome_name_to_path = parse_input_paths_file(fasta) else: # args.input_mode == "single: - circular_contigs = args.circular_contigs if args.circular_contigs else [] - if args.anno: + circular_contigs = circular_contigs if circular_contigs else [] + if anno: input_type = "annotation" - genome_name_to_path = {args.organism_name: {"path": args.annot, + genome_name_to_path = {organism_name: {"path": anno, "circular_contigs": circular_contigs}} - elif args.fasta: + elif fasta: input_type = "fasta" - genome_name_to_path = {args.organism_name: {"path": args.fasta, + genome_name_to_path = {organism_name: {"path": fasta, "circular_contigs": circular_contigs}} - + if input_type == "annotation": check_input_names(pangenome, genome_name_to_path) - organisms, org_2_has_fasta = read_annotation_files(genome_name_to_path, cpu=args.cpu, pseudo=args.use_pseudo, - disable_bar=args.disable_prog_bar) + organisms, org_2_has_fasta = 
read_annotation_files(genome_name_to_path, cpu=cpu, pseudo=use_pseudo, + disable_bar=disable_bar) if not all((has_fasta for has_fasta in org_2_has_fasta.values())): organisms_with_no_fasta = {org for org, has_fasta in org_2_has_fasta.items() if not has_fasta} - if args.fasta: + if fasta: get_gene_sequences_from_fasta_files(organisms_with_no_fasta, genome_name_to_path) else: raise ValueError(f"You provided GFF files for {len(organisms_with_no_fasta)} (out of {len(organisms)}) " - "organisms without associated sequence data, and you did not provide " + "organisms without associated sequence data, and you did not provide " "FASTA sequences using the --fasta or --single_fasta_file options. Therefore, it is impossible to project the pangenome onto the input genomes. " f"The following organisms have no associated sequence data: {', '.join(o.name for o in organisms_with_no_fasta)}") @@ -171,56 +159,51 @@ def launch(args: argparse.Namespace): annotate_param_names = ["norna", "kingdom", "allow_overlap", "prodigal_procedure"] - annotate_params = manage_annotate_param(annotate_param_names, pangenome_params.annotate, args.config) + annotate_params = manage_annotate_param(annotate_param_names, pangenome_params.annotate, config) check_input_names(pangenome, genome_name_to_path) - organisms = annotate_fasta_files(genome_name_to_fasta_path=genome_name_to_path, tmpdir=args.tmpdir, cpu=args.cpu, - translation_table=int(pangenome_params.cluster.translation_table), norna=annotate_params.norna, kingdom=annotate_params.kingdom, - allow_overlap=annotate_params.allow_overlap, procedure=annotate_params.prodigal_procedure, disable_bar=args.disable_prog_bar ) - - - input_org_to_lonely_genes_count = annotate_input_genes_with_pangenome_families(pangenome, input_organisms=organisms, - output=output_dir, cpu=args.cpu, use_representatives=args.fast, - no_defrag=args.no_defrag, identity=args.identity, - coverage=args.coverage, tmpdir=args.tmpdir, - 
translation_table=int(pangenome_params.cluster.translation_table), - keep_tmp=args.keep_tmp, - disable_bar=args.disable_prog_bar) - - - input_org_2_rgps, input_org_to_spots, input_orgs_to_modules = {}, {}, {} - - if predict_rgp: - logging.getLogger('PPanGGOLiN').info('Detecting RGPs in input genomes.') - - multigenics = pangenome.get_multigenics(pangenome_params.rgp.dup_margin) - - input_org_2_rgps = predict_RGP(pangenome, organisms, persistent_penalty=pangenome_params.rgp.persistent_penalty, variable_gain=pangenome_params.rgp.variable_gain, - min_length=pangenome_params.rgp.min_length, min_score=pangenome_params.rgp.min_score, multigenics=multigenics, output_dir=output_dir, - disable_bar=args.disable_prog_bar) - - - if project_spots: - logging.getLogger('PPanGGOLiN').info('Predicting spot of insertion in input genomes.') - input_org_to_spots = predict_spots_in_input_organisms(initial_spots=list(pangenome.spots), - initial_regions=pangenome.regions, - input_org_2_rgps=input_org_2_rgps, - multigenics=multigenics, - output=output_dir, - write_graph_flag=args.spot_graph, - graph_formats=args.graph_formats, - overlapping_match=pangenome_params.spot.overlapping_match, - set_size=pangenome_params.spot.set_size, - exact_match=pangenome_params.spot.exact_match_size) - - + organisms = annotate_fasta_files(genome_name_to_fasta_path=genome_name_to_path, tmpdir=tmpdir, cpu=cpu, + translation_table=int(pangenome_params.cluster.translation_table), norna=annotate_params.norna, kingdom=annotate_params.kingdom, + allow_overlap=annotate_params.allow_overlap, procedure=annotate_params.prodigal_procedure, disable_bar=disable_bar) + return organisms, genome_name_to_path, input_type + + +def write_projection_results(pangenome:Pangenome, organisms:Set[Organism], input_org_2_rgps:Dict[Organism, Set[Region]], + input_org_to_spots:Dict[Organism, Set[Spot]], + input_orgs_to_modules:Dict[Organism, Set[Module]] , + input_org_to_lonely_genes_count:Dict[Organism, int], + write_proksee:bool, 
write_gff:bool, add_sequences:bool, + genome_name_to_path:Dict[str,dict], input_type:str, + output_dir:Path, dup_margin:float, ): + """ + Write the results of the projection of pangneome onto input genomes. + + :param pangenome: The pangenome onto which the projection is performed. + :param organisms: A set of input organisms for projection. + :param input_org_2_rgps: A dictionary mapping input organisms to sets of regions of genomic plasticity (RGPs). + :param input_org_to_spots: A dictionary mapping input organisms to sets of spots. + :param input_orgs_to_modules: A dictionary mapping input organisms to sets of modules. + :param input_org_to_lonely_genes_count: A dictionary mapping input organisms to the count of lonely genes. + :param write_proksee: Whether to write ProkSee JSON files. + :param write_gff: Whether to write GFF files. + :param add_sequences: Whether to add sequences to the output files. + :param genome_name_to_path: A dictionary mapping genome names to file paths. + :param input_type: The type of input data (e.g., "annotation"). + :param output_dir: The directory where the output files will be written. + :param dup_margin: The duplication margin used to compute completeness. + + Note: + - If `write_proksee` is True and input organisms have modules, module colors for ProkSee are obtained. + - The function calls other functions such as `summarize_projection`, `read_genome_file`, `write_proksee_organism`, + `write_gff_file`, and `write_summaries` to generate various output files and summaries. 
+ """ - if project_modules: + if write_proksee and input_orgs_to_modules: # get module color for proksee module_to_colors = manage_module_colors(set(pangenome.modules)) - - input_orgs_to_modules = project_and_write_modules(pangenome, organisms, output_dir) + + single_copy_families = get_single_copy_families(pangenome, dup_margin) organism_2_summary = {} @@ -229,19 +212,19 @@ def launch(args: argparse.Namespace): org_outdir = output_dir / organism.name # summarize projection for all input organisms - organism_2_summary[organism] = summarize_projection(organism, pangenome, single_copy_fams, + organism_2_summary[organism] = summarize_projection(organism, pangenome, single_copy_families, input_org_2_rgps.get(organism, None), input_org_to_spots.get(organism, None), input_orgs_to_modules.get(organism, None), input_org_to_lonely_genes_count[organism]) - if (args.proksee or args.gff) and args.add_sequences: + if (write_proksee or write_gff) and add_sequences: genome_sequences = read_genome_file(genome_name_to_path[organism.name]['path'], organism) genome_name_to_path[organism.name]['path'] else: genome_sequences = None - if args.proksee: + if write_proksee: org_module_to_color = {org_mod: module_to_colors[org_mod] for org_mod in input_orgs_to_modules.get(organism, [])} output_file = output_dir / organism.name / f"{organism.name}_proksee.json" @@ -252,7 +235,7 @@ def launch(args: argparse.Namespace): genome_sequences=genome_sequences) - if args.gff: + if write_gff: if input_type == "annotation": # if the genome has not been annotated by PPanGGOLiN annotation_sources = {"rRNA": "external", "tRNA": "external", @@ -277,7 +260,7 @@ def launch(args: argparse.Namespace): - write_summaries(organism_2_summary, output_dir) + write_summaries(organism_2_summary, output_dir) def annotate_fasta_files(genome_name_to_fasta_path: Dict[str, dict], tmpdir: str, cpu: int = 1, translation_table: int = 11, @@ -463,15 +446,47 @@ def write_summaries(organism_2_summary: Dict[Organism, Dict[str, 
Any]], output_d df_summary.to_csv(output_dir / "summary_projection.tsv", sep='\t', index=False) +def get_single_copy_families(pangenome: Pangenome, dup_margin:float): + """ + Get single copy families + + :param pangenome: The pangenome onto which the projection is performed. + :param dup_margin: The duplication margin used to compute single copy families. + + """ + + # TODO make this single_copy_fams a method of class Pangenome that should be used in write --stats + single_copy_families = set() + + for fam in pangenome.gene_families: + if fam.named_partition == "persistent": + dup = len([genes for genes in fam.get_org_dict().values() if + len([gene for gene in genes if not gene.is_fragment]) > 1]) + + if (dup / fam.number_of_organisms) < dup_margin: + single_copy_families.add(fam) + + return single_copy_families + def summarize_projection(input_organism:Organism, pangenome:Pangenome, single_copy_families:Set, input_org_rgps:Region, input_org_spots:Spot, input_org_modules:Module, singleton_gene_count:int): """ + Summarize the projection of an input organism onto a pangenome. - :param singleton_gene_count: Number of genes that do not cluster with any of the gene families of the pangenome. + :param input_organism: The input organism for projection. + :param input_org_rgps: The regions of genomic plasticity (RGPs) in the input organism. + :param input_org_spots: The spots in the input organism. + :param input_org_modules: The modules in the input organism. + :param singleton_gene_count: Number of genes that do not cluster with any gene families in the pangenome. + + Returns: + A dictionary containing summary information about the projection, including organism details, + gene and family counts, completeness, and counts of RGPs, spots, new spots, and modules. 
""" + partition_to_gene = defaultdict(set) contigs_count = 0 for contig in input_organism.contigs: @@ -485,7 +500,7 @@ def summarize_projection(input_organism:Organism, pangenome:Pangenome, single_c completeness = "NA" - single_copy_markers_count = len(set(input_organism.families) & single_copy_families ) + single_copy_markers_count = len(set(input_organism.families) & single_copy_families) if len(single_copy_families) > 0: completeness = round((single_copy_markers_count / len(single_copy_families)) * 100, 2) @@ -1184,6 +1199,91 @@ def check_projection_arguments(args: argparse.Namespace, parser: argparse.Argume return input_mode + + +def launch(args: argparse.Namespace): + """ + Command launcher + + :param args: All arguments provide by user + """ + + output_dir = Path(args.output) + mk_outdir(output_dir, args.force) + + # For the moment these elements of the pangenome are predicted by default + + pangenome = Pangenome() + pangenome.add_file(args.pangenome) + + predict_rgp, project_spots, project_modules = check_pangenome_for_projection(pangenome, args.fast) + + check_pangenome_info(pangenome, need_annotations=True, need_families=True, disable_bar=args.disable_prog_bar, + need_rgp=predict_rgp, need_modules=project_modules, need_gene_sequences=False, + need_spots=project_spots) + + logging.getLogger('PPanGGOLiN').info('Retrieving parameters from the provided pangenome file.') + pangenome_params = argparse.Namespace( + **{step: argparse.Namespace(**k_v) for step, k_v in pangenome.parameters.items()}) + + organisms, genome_name_to_path, input_type = manage_input_genomes_annotation(pangenome=pangenome, + input_mode=args.input_mode, + anno=args.anno, fasta=args.fasta, + organism_name=args.organism_name, + circular_contigs=args.circular_contigs, + pangenome_params=pangenome_params, + cpu=args.cpu, use_pseudo=args.use_pseudo, + disable_bar=args.disable_prog_bar, + tmpdir= args.tmpdir, config=args.config) + + + + input_org_to_lonely_genes_count = 
annotate_input_genes_with_pangenome_families(pangenome, input_organisms=organisms, + output=output_dir, cpu=args.cpu, use_representatives=args.fast, + no_defrag=args.no_defrag, identity=args.identity, + coverage=args.coverage, tmpdir=args.tmpdir, + translation_table=int(pangenome_params.cluster.translation_table), + keep_tmp=args.keep_tmp, + disable_bar=args.disable_prog_bar) + + + input_org_2_rgps, input_org_to_spots, input_orgs_to_modules = {}, {}, {} + + if predict_rgp: + + logging.getLogger('PPanGGOLiN').info('Detecting RGPs in input genomes.') + + multigenics = pangenome.get_multigenics(pangenome_params.rgp.dup_margin) + + input_org_2_rgps = predict_RGP(pangenome, organisms, persistent_penalty=pangenome_params.rgp.persistent_penalty, variable_gain=pangenome_params.rgp.variable_gain, + min_length=pangenome_params.rgp.min_length, min_score=pangenome_params.rgp.min_score, multigenics=multigenics, output_dir=output_dir, + disable_bar=args.disable_prog_bar) + + if project_spots: + logging.getLogger('PPanGGOLiN').info('Predicting spot of insertion in input genomes.') + input_org_to_spots = predict_spots_in_input_organisms(initial_spots=list(pangenome.spots), + initial_regions=pangenome.regions, + input_org_2_rgps=input_org_2_rgps, + multigenics=multigenics, + output=output_dir, + write_graph_flag=args.spot_graph, + graph_formats=args.graph_formats, + overlapping_match=pangenome_params.spot.overlapping_match, + set_size=pangenome_params.spot.set_size, + exact_match=pangenome_params.spot.exact_match_size) + + if project_modules: + input_orgs_to_modules = project_and_write_modules(pangenome, organisms, output_dir) + + write_projection_results(pangenome, organisms, input_org_2_rgps, + input_org_to_spots, + input_orgs_to_modules, + input_org_to_lonely_genes_count, + write_proksee=args.proksee, write_gff=args.gff, add_sequences=args.add_sequences, + genome_name_to_path=genome_name_to_path, input_type=input_type, + output_dir=output_dir, dup_margin=args.dup_margin) + + def 
subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: """ Subparser to launch PPanGGOLiN in Command line From b0b7ef1ad8e011a9327c43f7acef688f3ba22642 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Tue, 24 Oct 2023 11:19:08 +0200 Subject: [PATCH 173/173] Refactoring --- ppanggolin/align/alignOnPang.py | 21 ++++---- ppanggolin/cluster/cluster.py | 2 - ppanggolin/figures/draw_spot.py | 4 +- ppanggolin/figures/tile_plot.py | 2 +- ppanggolin/formats/writeFlat.py | 6 +-- ppanggolin/formats/writeMSA.py | 2 - ppanggolin/formats/writeMetadata.py | 2 +- ppanggolin/formats/writeSequences.py | 2 +- ppanggolin/geneFamily.py | 6 +-- ppanggolin/graph/makeGraph.py | 2 - ppanggolin/meta/meta.py | 2 +- ppanggolin/pangenome.py | 74 ++++++++++++++-------------- ppanggolin/projection/projection.py | 19 ++++--- ppanggolin/utils.py | 10 ++-- tests/test_pangenome.py | 24 ++++----- 15 files changed, 84 insertions(+), 94 deletions(-) diff --git a/ppanggolin/align/alignOnPang.py b/ppanggolin/align/alignOnPang.py index 28a6bc83..e95852ac 100644 --- a/ppanggolin/align/alignOnPang.py +++ b/ppanggolin/align/alignOnPang.py @@ -278,10 +278,10 @@ def project_and_write_partition(seqid_to_gene_family: Dict[str, GeneFamily], seq partition_proj = output.absolute() / "sequences_partition_projection.tsv" with open(partition_proj, "w") as partProjFile: - for input_seq, pangFam in seqid_to_gene_family.items(): - partProjFile.write(input_seq + "\t" + pangFam.named_partition + "\n") - for remainingSeq in seq_set - seqid_to_gene_family.keys(): - partProjFile.write(remainingSeq + "\tcloud\n") # if there is no hit, it's going to be cloud genes. + for input_seq, gene_fam in seqid_to_gene_family.items(): + partProjFile.write(input_seq + "\t" + gene_fam.named_partition + "\n") + for remaining_seq in seq_set - seqid_to_gene_family.keys(): + partProjFile.write(remaining_seq + "\tcloud\n") # if there is no hit, it's going to be cloud genes. 
return partition_proj @@ -298,11 +298,11 @@ def write_gene_to_gene_family(seqid_to_gene_family: Dict[str, GeneFamily], seq_s gene_fam_map_file = output.absolute() / "gene_to_gene_family.tsv" with open(gene_fam_map_file, "w") as partProjFile: - for input_seq, pangFam in seqid_to_gene_family.items(): - partProjFile.write(f"{input_seq}\t{pangFam.name}\n") + for input_seq, gene_fam in seqid_to_gene_family.items(): + partProjFile.write(f"{input_seq}\t{gene_fam.name}\n") - for remainingSeq in seq_set - seqid_to_gene_family.keys(): - partProjFile.write(f"{remainingSeq}\t{remainingSeq}\n") # if there is no hit, gene family is itself. + for remaining_seq in seq_set - seqid_to_gene_family.keys(): + partProjFile.write(f"{remaining_seq}\t{remaining_seq}\n") # if there is no hit, gene family is itself. return gene_fam_map_file @@ -517,9 +517,8 @@ def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_files: Itera def align(pangenome: Pangenome, sequence_file: Path, output: Path, identity: float = 0.8, coverage: float = 0.8, no_defrag: bool = False, cpu: int = 1, getinfo: bool = False, - use_representatives: bool = False, - draw_related: bool = False, translation_table: int = 11, tmpdir: Path = None, - disable_bar: bool = False, keep_tmp=False): + use_representatives: bool = False, draw_related: bool = False, translation_table: int = 11, + tmpdir: Path = None, disable_bar: bool = False, keep_tmp=False): """ Aligns pangenome sequences with sequences in a FASTA file using MMSeqs2. 
diff --git a/ppanggolin/cluster/cluster.py b/ppanggolin/cluster/cluster.py index 4382ba43..4ed3ad2e 100644 --- a/ppanggolin/cluster/cluster.py +++ b/ppanggolin/cluster/cluster.py @@ -309,8 +309,6 @@ def clustering(pangenome: Pangenome, tmpdir: Path, cpu: int = 1, defrag: bool = newtmpdir = tempfile.TemporaryDirectory(dir=tmpdir) tmp_path = Path(newtmpdir.name) - # newtmpdir = tempfile.TemporaryDirectory(dir=tmpdir) - # tmp_path = Path(newtmpdir.name) with open(tmp_path/'nucleotid_sequences', "w") as sequence_file: check_pangenome_for_clustering(pangenome, sequence_file, force, disable_bar=disable_bar) logging.getLogger("PPanGGOLiN").info("Clustering all of the genes sequences...") diff --git a/ppanggolin/figures/draw_spot.py b/ppanggolin/figures/draw_spot.py index f675682b..d40b38c3 100644 --- a/ppanggolin/figures/draw_spot.py +++ b/ppanggolin/figures/draw_spot.py @@ -647,10 +647,10 @@ def draw_spots(pangenome: Pangenome, output: Path, spot_list: str, disable_bar: need_rgp=True, need_spots=True, need_modules=need_mod, disable_bar=disable_bar) if spot_list == 'all' or any(x == 'all' for x in spot_list): - logging.getLogger("PPanGGOLiN").debug(f"'all' value is found in spot list, all spots are drawn.") + logging.getLogger("PPanGGOLiN").debug("'all' value is found in spot list, all spots are drawn.") selected_spots = list(pangenome.spots) elif spot_list == "synteny" or any(x == 'synteny' for x in spot_list): - logging.getLogger().debug(f"'synteny' value is found in spot list, all spots with more than 1 conserved synteny are drawn.") + logging.getLogger().debug("'synteny' value is found in spot list, all spots with more than 1 conserved synteny are drawn.") selected_spots = [s for s in pangenome.spots if len(s.get_uniq_ordered_set()) > 1] else: curated_spot_list = {'spot_' + str(s) if not s.startswith("spot_") else str(s) for s in spot_list} diff --git a/ppanggolin/figures/tile_plot.py b/ppanggolin/figures/tile_plot.py index 762d978d..e0ff3f14 100644 --- 
a/ppanggolin/figures/tile_plot.py +++ b/ppanggolin/figures/tile_plot.py @@ -173,7 +173,7 @@ def draw_tile_plot(pangenome: Pangenome, output: Path, nocloud: bool = False, di shapes=shapes, plot_bgcolor='#ffffff') logging.getLogger("PPanGGOLiN").info("Drawing the figure itself...") - #fig = go.Figure(data=[heatmap], layout=layout) + fig = go.Figure(data=[heatmap]) fig.add_trace(go.Scatter(x=dendro_org['icoord'], diff --git a/ppanggolin/formats/writeFlat.py b/ppanggolin/formats/writeFlat.py index 97421c5d..e12170dd 100644 --- a/ppanggolin/formats/writeFlat.py +++ b/ppanggolin/formats/writeFlat.py @@ -852,7 +852,7 @@ def write_gff_file(org: Organism, contig_to_rgp: Dict[Contig, Region], score = '.' - if type(feature) == Gene: + if isinstance(feature, Gene): rgp = feature.RGP.name if feature.RGP else "" attributes += [ ("Family", feature.family.name), @@ -879,7 +879,7 @@ def write_gff_file(org: Organism, contig_to_rgp: Dict[Contig, Region], line_str = '\t'.join(map(str, gene_line)) outfile.write(line_str + "\n") - elif type(feature) == Region: + elif isinstance(feature, Region): feat_type = "region" source = "ppanggolin" strand = "." @@ -913,7 +913,7 @@ def write_gff_file(org: Organism, contig_to_rgp: Dict[Contig, Region], if genome_sequences: logging.getLogger("PPanGGOLiN").debug("Writing fasta section of gff file...") - outfile.write(f"##FASTA\n") + outfile.write("##FASTA\n") for contig in sorted_contigs: outfile.write(f">{contig.name}\n") diff --git a/ppanggolin/formats/writeMSA.py b/ppanggolin/formats/writeMSA.py index 6c7363f5..af189edd 100644 --- a/ppanggolin/formats/writeMSA.py +++ b/ppanggolin/formats/writeMSA.py @@ -237,14 +237,12 @@ def write_whole_genome_msa(pangenome: Pangenome, families: set, phylo_name: str, genome_id = "" seq = "" curr_len = 0 - dup_gene = 0 # TODO Remove ? curr_phylo_dict = {} for line in fin: if line.startswith('>'): if genome_id != "": if genome_id not in missing_genomes: - dup_gene += 1 # duplicated genes. Replacing them with gaps. 
curr_phylo_dict[genome_id] = "-" * curr_len else: diff --git a/ppanggolin/formats/writeMetadata.py b/ppanggolin/formats/writeMetadata.py index 59c3d7c4..0cb7e2cb 100644 --- a/ppanggolin/formats/writeMetadata.py +++ b/ppanggolin/formats/writeMetadata.py @@ -124,7 +124,7 @@ def get_metadata_len(select_elem: List[Module], source: str) -> Tuple[Dict[str, if isinstance(value, float) or isinstance(value, int): if attr in type_dict: if type_dict[attr] != type(value): - if type(value) == float and type_dict[attr] == int: + if isinstance(value, float) and isinstance(type_dict[attr], int): type_dict[attr] = tables.Float64Col() else: if isinstance(value, float): diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py index acbface1..6ad32c29 100644 --- a/ppanggolin/formats/writeSequences.py +++ b/ppanggolin/formats/writeSequences.py @@ -18,7 +18,7 @@ from ppanggolin.utils import write_compressed_or_not, mk_outdir, read_compressed_or_not, restricted_float, detect_filetype from ppanggolin.formats.readBinaries import check_pangenome_info, get_gene_sequences_from_file -module_regex = re.compile(r'^module_[0-9]+') +module_regex = re.compile(r'^module_\d+') #\d == [0-9] poss_values = ['all', 'persistent', 'shell', 'cloud', 'rgp', 'softcore', 'core', module_regex] poss_values_log = f"Possible values are {', '.join(poss_values[:-1])}, module_X with X being a module id." 
diff --git a/ppanggolin/geneFamily.py b/ppanggolin/geneFamily.py index a6e05198..a8a8f3ff 100644 --- a/ppanggolin/geneFamily.py +++ b/ppanggolin/geneFamily.py @@ -341,16 +341,16 @@ def mk_bitarray(self, index: Dict[Organism, int], partition: str = 'all'): """ self.bitarray = gmpy2.xmpz() # pylint: disable=no-member if partition == 'all': - logging.getLogger("PPanGGOLiN").debug(f"all") + logging.getLogger("PPanGGOLiN").debug("all") for org in self.organisms: self.bitarray[index[org]] = 1 elif partition in ['shell', 'cloud']: - logging.getLogger("PPanGGOLiN").debug(f"shell, cloud") + logging.getLogger("PPanGGOLiN").debug("shell, cloud") if self.named_partition == partition: for org in self.organisms: self.bitarray[index[org]] = 1 elif partition == 'accessory': - logging.getLogger("PPanGGOLiN").debug(f"accessory") + logging.getLogger("PPanGGOLiN").debug("accessory") if self.named_partition in ['shell', 'cloud']: for org in self.organisms: self.bitarray[index[org]] = 1 diff --git a/ppanggolin/graph/makeGraph.py b/ppanggolin/graph/makeGraph.py index c9272a60..69557705 100644 --- a/ppanggolin/graph/makeGraph.py +++ b/ppanggolin/graph/makeGraph.py @@ -110,9 +110,7 @@ def compute_neighbors_graph(pangenome: Pangenome, remove_copy_number: int = 0, pangenome.status["neighborsGraph"] = "Computed" pangenome.parameters["graph"] = {} - # pangenome.parameters["graph"]["removed_high_copy_number_families"] = False if remove_copy_number > 0: - # pangenome.parameters["graph"]["removed_high_copy_number_families"] = True pangenome.parameters["graph"]["remove_high_copy_number"] = remove_copy_number diff --git a/ppanggolin/meta/meta.py b/ppanggolin/meta/meta.py index 44205b4a..f7506599 100644 --- a/ppanggolin/meta/meta.py +++ b/ppanggolin/meta/meta.py @@ -65,7 +65,7 @@ def check_metadata_format(metadata: Path, metatype: str) -> pd.DataFrame: :return: Dataframe with metadata loaded """ assert metatype in ["families", "genomes", "contigs", "genes", "RGPs", "spots", "modules"] - 
colname_check = re.compile('^[a-zA-Z_][a-zA-Z0-9_]*$') + colname_check = re.compile('^[a-zA-Z_]\w*$') # \w = [A-Za-z0-9_] metadata_df = pd.read_csv(metadata, sep="\t", header=0, quoting=csv.QUOTE_NONE, dtype={metatype: str}) metadata_df.replace(to_replace='-', value=pd.NA, inplace=True) diff --git a/ppanggolin/pangenome.py b/ppanggolin/pangenome.py index bf72e1f8..7727193f 100644 --- a/ppanggolin/pangenome.py +++ b/ppanggolin/pangenome.py @@ -29,15 +29,15 @@ def __init__(self): self.file = None # basic parameters - self._famGetter = {} + self._fam_getter = {} self._org_index = None self._fam_index = None self._max_fam_id = 0 - self._orgGetter = {} - self._edgeGetter = {} - self._regionGetter = {} - self._spotGetter = {} - self._moduleGetter = {} + self._org_getter = {} + self._edge_getter = {} + self._region_getter = {} + self._spot_getter = {} + self._module_getter = {} self.status = { 'genomesAnnotated': "No", 'geneSequences': "No", @@ -103,16 +103,16 @@ def genes(self) -> Generator[Gene, None, None]: def _mk_gene_getter(self): """ - Builds the attribute _geneGetter of the pangenome + Builds the attribute _gene_getter of the pangenome Since the genes are never explicitly 'added' to a pangenome (but rather to a gene family, or a contig), the pangenome cannot directly extract a gene from a geneID since it does not 'know' them. - If at some point we want to extract genes from a pangenome we'll create a geneGetter. + If at some point we want to extract genes from a pangenome we'll create a gene_getter. The assumption behind this is that the pangenome has been filled and no more gene will be added. 
""" - self._geneGetter = {} + self._gene_getter = {} for gene in self.genes: - self._geneGetter[gene.ID] = gene + self._gene_getter[gene.ID] = gene def get_gene(self, gene_id: str) -> Gene: """Returns the gene that has the given gene ID @@ -127,7 +127,7 @@ def get_gene(self, gene_id: str) -> Gene: assert isinstance(gene_id, str), "Gene id should be an integer" try: - return self._geneGetter[gene_id] + return self._gene_getter[gene_id] except AttributeError: # in that case, either the gene getter has not been computed, or the geneID is not in the pangenome. self._mk_gene_getter() # make it @@ -142,10 +142,10 @@ def number_of_genes(self) -> int: :return: The number of genes """ try: - return len(self._geneGetter) + return len(self._gene_getter) except AttributeError: # in that case the gene getter has not been computed self._mk_gene_getter() # make it - return len(self._geneGetter) + return len(self._gene_getter) """RNAs methods""" @property @@ -187,7 +187,7 @@ def gene_families(self) -> Generator[GeneFamily, None, None]: :return: Generator of gene families """ - for family in self._famGetter.values(): + for family in self._fam_getter.values(): yield family @property @@ -196,7 +196,7 @@ def number_of_gene_families(self) -> int: :return: The number of gene families """ - return len(self._famGetter) + return len(self._fam_getter) def get_gene_family(self, name: str) -> GeneFamily: """Returns the gene family that has the given `name` @@ -210,7 +210,7 @@ def get_gene_family(self, name: str) -> GeneFamily: """ assert isinstance(name, str), "Name of gene family should be a string" try: - fam = self._famGetter[name] + fam = self._fam_getter[name] except KeyError: raise KeyError(f"Gene family with name={name} is not in pangenome") except Exception as error: @@ -231,7 +231,7 @@ def add_gene_family(self, family: GeneFamily): try: _ = self.get_gene_family(family.name) except KeyError: - self._famGetter[family.name] = family + self._fam_getter[family.name] = family 
self.max_fam_id += 1 except Exception as error: raise Exception(error) @@ -245,7 +245,7 @@ def edges(self) -> Generator[Edge, None, None]: :return: Generator of edge """ - for edge in self._edgeGetter.values(): + for edge in self._edge_getter.values(): yield edge def add_edge(self, gene1: Gene, gene2: Gene) -> Edge: @@ -267,10 +267,10 @@ def add_edge(self, gene1: Gene, gene2: Gene) -> Edge: raise AttributeError("Genes are not linked to families. Check that you compute the gene families and post an" " issue on our GitHub") key = frozenset([family_1, family_2 ]) - edge = self._edgeGetter.get(key) + edge = self._edge_getter.get(key) if edge is None: edge = Edge(gene1, gene2) - self._edgeGetter[key] = edge + self._edge_getter[key] = edge else: edge.add_genes(gene1, gene2) return edge @@ -281,7 +281,7 @@ def number_of_edges(self) -> int: :return: The number of gene families """ - return len(self._edgeGetter) + return len(self._edge_getter) """Organism methods""" @property @@ -290,7 +290,7 @@ def organisms(self) -> Generator[Organism, None, None]: :return: Generator :class:`ppanggolin.genome.Organism` """ - for organism in self._orgGetter.values(): + for organism in self._org_getter.values(): yield organism @property @@ -299,7 +299,7 @@ def number_of_organisms(self) -> int: :return: The number of organism """ - return len(self._orgGetter) + return len(self._org_getter) @property def contigs(self) -> Generator[Contig, None, None]: @@ -377,7 +377,7 @@ def get_organism(self, name: str) -> Organism: """ assert isinstance(name, str), "Organism name should be a string" try: - return self._orgGetter[name] + return self._org_getter[name] except KeyError: raise KeyError(f"{name} does not seem to be in your pangenome") @@ -397,7 +397,7 @@ def add_organism(self, organism: Organism): try: self.get_organism(organism.name) except KeyError: - self._orgGetter[organism.name] = organism + self._org_getter[organism.name] = organism else: raise KeyError(f"Redondant organism name was found 
({organism.name})." f"All of your organisms must have unique names.") @@ -469,7 +469,7 @@ def regions(self) -> Generator[Region, None, None]: :return: list of RGP """ - for region in self._regionGetter.values(): + for region in self._region_getter.values(): yield region def get_region(self, name: str) -> Region: @@ -485,7 +485,7 @@ def get_region(self, name: str) -> Region: assert isinstance(name, str), "RGP name should be a string" try: - rgp = self._regionGetter[name] + rgp = self._region_getter[name] except KeyError: # then the region is not stored in this pangenome. raise KeyError(f"There is no RGP with name={name}") else: @@ -526,7 +526,7 @@ def add_region(self, region: Region): try: self.get_region(region.name) except KeyError: - self._regionGetter[region.name] = region + self._region_getter[region.name] = region else: raise KeyError(f"A RGP with this name ({region.name} already exist in pangenome") @@ -536,7 +536,7 @@ def number_of_rgp(self) -> int: :return: The number of gene families """ - return len(self._regionGetter) + return len(self._region_getter) """Spot methods""" @property @@ -544,7 +544,7 @@ def spots(self) -> Generator[Spot, None, None]: """Generate spots in the pangenome :return: Spot generator""" - yield from self._spotGetter.values() + yield from self._spot_getter.values() def get_spot(self, spot_id: Union[int, str]) -> Spot: # TODO Change for only str or only int @@ -568,7 +568,7 @@ def get_spot(self, spot_id: Union[int, str]) -> Spot: raise ValueError(f"The provided spot ID '{spot_id}' does not have the expected format." 
"It should be an integer or in the format 'spot_'.") try: - spot = self._spotGetter[spot_id] + spot = self._spot_getter[spot_id] except KeyError: raise KeyError(f"Spot {spot_id} does not exist in the pangenome.") else: @@ -586,7 +586,7 @@ def add_spot(self, spot: Spot): try: self.get_spot(spot.ID) except KeyError: - self._spotGetter[spot.ID] = spot + self._spot_getter[spot.ID] = spot except Exception as error: raise Exception(error) else: @@ -598,14 +598,14 @@ def number_of_spots(self) -> int: :return: The number of gene families """ - return len(self._spotGetter) + return len(self._spot_getter) """Modules methods""" @property def modules(self) -> Generator[Module, None, None]: """Generate modules in the pangenome """ - yield from self._moduleGetter.values() + yield from self._module_getter.values() def get_module(self, module_id: Union[int, str]) -> Module: # TODO Change for only str or only int @@ -631,7 +631,7 @@ def get_module(self, module_id: Union[int, str]) -> Module: "It should be an integer or in the format 'module_'.") try: - module = self._moduleGetter[module_id] + module = self._module_getter[module_id] except KeyError: raise KeyError(f"Module {module_id} does not exist in the pangenome.") else: @@ -649,7 +649,7 @@ def add_module(self, module: Module): try: self.get_module(module.ID) except KeyError: - self._moduleGetter[module.ID] = module + self._module_getter[module.ID] = module except Exception as error: raise Exception(error) else: @@ -680,7 +680,7 @@ def number_of_modules(self) -> int: :return: The number of modules """ - return len(self._moduleGetter) + return len(self._module_getter) """Metadata""" def select_elem(self, metatype: str): diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index 3bd6a42a..1059aac7 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -70,7 +70,7 @@ def launch(args: argparse.Namespace): pangenome.add_file(args.pangenome) if 
pangenome.status["partitioned"] not in ["Computed", "Loaded", "inFile"]: - raise NameError(f"The provided pangenome has not been partitioned. " + raise NameError("The provided pangenome has not been partitioned. " "Annotation of an external genome is therefore not possible. " "See the 'partition' subcommands.") @@ -125,7 +125,6 @@ def launch(args: argparse.Namespace): single_copy_fams.add(fam) - # genome_name_to_fasta_path, genome_name_to_annot_path = None, None genome_name_to_path = None if args.input_mode == "multiple": @@ -451,7 +450,7 @@ def write_summaries(organism_2_summary: Dict[Organism, Dict[str, Any]], output_d flat_summary = {} for key, val in summary_info.items(): - if type(val) == dict: + if isinstance(val, dict): for nest_k, nest_v in val.items(): flat_summary[f"{key} {nest_k}"] = nest_v else: @@ -545,17 +544,17 @@ def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_org """ seq_fasta_files = [] - logging.getLogger('PPanGGOLiN').info(f'Writting gene sequences of input genomes.') + logging.getLogger('PPanGGOLiN').info('Writting gene sequences of input genomes.') for input_organism in input_organisms: seq_outdir = output / input_organism.name mk_outdir(seq_outdir, force=True) - seq_fasta_file = seq_outdir / f"cds_sequences.fasta" + seq_fasta_file = seq_outdir / "cds_sequences.fasta" with open(seq_fasta_file, "w") as fh_out_faa: - write_gene_sequences_from_annotations(input_organism.genes, fh_out_faa, disable_bar=True, add=f"ppanggolin_") + write_gene_sequences_from_annotations(input_organism.genes, fh_out_faa, disable_bar=True, add="ppanggolin_") seq_fasta_files.append(seq_fasta_file) @@ -854,7 +853,7 @@ def predict_spots_in_input_organisms( initial_regions: List[Region], input_org_2_rgps: Dict[Organism, Set[Region]], multigenics: Set[GeneFamily], - output: str, + output: Path, write_graph_flag: bool = False, graph_formats: List[str] = ['gexf'], overlapping_match: int = 2, @@ -877,7 +876,7 @@ def predict_spots_in_input_organisms( 
:return: A dictionary mapping input organism RGPs to their predicted spots. """ - logging.getLogger("PPanGGOLiN").debug(f"Rebuilding original spot graph.") + logging.getLogger("PPanGGOLiN").debug("Rebuilding original spot graph.") graph_spot = make_spot_graph(rgps=initial_regions, multigenics=multigenics, overlapping_match=overlapping_match, set_size=set_size, exact_match=exact_match) @@ -965,7 +964,7 @@ def predict_spot_in_one_organism( "as they are on a contig border (or have " f"less than {set_size} persistent gene families until the contig border). " "Projection of spots stops here") - return {} + return set() # remove node that were already in the graph new_nodes = set(input_org_node_to_rgps) - original_nodes @@ -1173,7 +1172,7 @@ def check_projection_arguments(args: argparse.Namespace, parser: argparse.Argume if args.circular_contigs: parser.error("You provided a TSV file listing the files of genomes you wish to annotate. " - f"Therefore, the argument '--circular_contigs' is incompatible with this multiple genomes file.") + "Therefore, the argument '--circular_contigs' is incompatible with this multiple genomes file.") if args.fasta: check_input_files(args.fasta, True) diff --git a/ppanggolin/utils.py b/ppanggolin/utils.py index 07e6bc75..e6bc20cb 100755 --- a/ppanggolin/utils.py +++ b/ppanggolin/utils.py @@ -483,7 +483,7 @@ def get_arg_name(arg_val: Union[str, TextIOWrapper]) -> Union[str, TextIOWrapper :return: Either a string or a TextIOWrapper object, depending on the type of the input argument. 
""" - if type(arg_val) == TextIOWrapper: + if isinstance(arg_val, TextIOWrapper): return arg_val.name return arg_val @@ -733,7 +733,7 @@ def count_different_values(values: Iterable[Union[int, str, Tuple, List]]) -> in """ hashable_values = set() for value in values: - hashable_value = tuple(value) if type(value) == list else value + hashable_value = tuple(value) if isinstance(value, list) else value hashable_values.add(hashable_value) return len(hashable_values) @@ -765,7 +765,7 @@ def set_up_config_param_to_parser(config_param_val: dict) -> list: arguments_to_parse = [] for param, val in config_param_val.items(): - if type(val) == bool or val is None or val == "None": + if isinstance(val, bool) or val is None or val == "None": # param is a flag if val is True: arguments_to_parse.append(f"--{param}") @@ -773,7 +773,7 @@ def set_up_config_param_to_parser(config_param_val: dict) -> list: else: arguments_to_parse.append(f"--{param}") - if type(val) == list: + if isinstance(val, list): # range of values need to be added one by one arguments_to_parse += [str(v) for v in val] else: @@ -906,8 +906,6 @@ def get_cli_args(subparser_fct: Callable) -> argparse.Namespace: # remove argument that have not been specified delete_unspecified_args(cli_args) delattr(cli_args, 'subcommand') - # if 'config' in cli_args: - # delattr(cli_args, 'config') return cli_args diff --git a/tests/test_pangenome.py b/tests/test_pangenome.py index 8b1123ad..d67eb1ce 100644 --- a/tests/test_pangenome.py +++ b/tests/test_pangenome.py @@ -39,15 +39,15 @@ def test_cstr(self, pangenome): """ pangenome_attr_type = { "file": type(None), - "_famGetter": dict, + "_fam_getter": dict, "_org_index": type(None), "_fam_index": type(None), "_max_fam_id": int, - "_orgGetter": dict, - "_edgeGetter": dict, - "_regionGetter": dict, - "_spotGetter": dict, - "_moduleGetter": dict, + "_org_getter": dict, + "_edge_getter": dict, + "_region_getter": dict, + "_spot_getter": dict, + "_module_getter": dict, "status": dict, 
"parameters": dict } @@ -597,8 +597,8 @@ def test_add_region(self, pangenome): """ rgp = Region(name="rgp") pangenome.add_region(rgp) - assert len(pangenome._regionGetter) == 1 - assert pangenome._regionGetter["rgp"] == rgp + assert len(pangenome._region_getter) == 1 + assert pangenome._region_getter["rgp"] == rgp def test_add_region_already_in_pangenome(self, pangenome): """Tests that adding region already in pangenome return a KeyError. @@ -665,8 +665,8 @@ def test_add_spot(self, pangenome): """ spot = Spot(spot_id=0) pangenome.add_spot(spot) - assert len(pangenome._spotGetter) == 1 - assert pangenome._spotGetter[0] == spot + assert len(pangenome._spot_getter) == 1 + assert pangenome._spot_getter[0] == spot def test_add_spot_already_in_pangenome(self, pangenome): """Tests that adding spot already in pangenome return a KeyError. @@ -734,8 +734,8 @@ def test_add_module(self, pangenome): """ module = Module(module_id=0) pangenome.add_module(module) - assert len(pangenome._moduleGetter) == 1 - assert pangenome._moduleGetter[0] == module + assert len(pangenome._module_getter) == 1 + assert pangenome._module_getter[0] == module def test_add_module_already_in_pangenome(self, pangenome): """Tests that adding module already in pangenome return a KeyError.