From 705d14ec30d85a0a19d930cbbad02a63079d746f Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 25 Apr 2023 14:42:00 +0200 Subject: [PATCH 01/25] add window size parameter in ppanggolin context --- ppanggolin/context/searchGeneContext.py | 66 ++++++++++++++++++------- 1 file changed, 49 insertions(+), 17 deletions(-) diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index 75e2350c..e1b91282 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -6,6 +6,9 @@ import tempfile import time import logging +import os +from typing import List, Dict, Tuple + # installed libraries from tqdm import tqdm @@ -23,7 +26,7 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: str, sequences: str = None, families: str = None, transitive: int = 4, identity: float = 0.5, - coverage: float = 0.8, jaccard: float = 0.85, no_defrag: bool = False, + coverage: float = 0.8, jaccard: float = 0.85, window_size: int = 1, no_defrag: bool = False, cpu: int = 1, disable_bar=True): """ Main function to search common gene contexts between sequence set and pangenome families @@ -37,6 +40,7 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: :param identity: minimum identity threshold between sequences and gene families for the alignment :param coverage: minimum coverage threshold between sequences and gene families for the alignment :param jaccard: Jaccard index to filter edges in graph + :param window_size: Number of genes to consider in the gene context. :param no_defrag: do not use the defrag workflow if true :param cpu: Number of core used to process :param disable_bar: Allow preventing bar progress print @@ -45,7 +49,7 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: # check statuses and load info if sequences is not None and pangenome.status["geneFamilySequences"] not in ["inFile", "Loaded", "Computed"]: raise Exception("Cannot use this function as your pangenome does not have gene families representatives " - "associated to it. For now this works only if the clustering is realised by PPanGGOLiN.") + "associated to it. For now this works only if the clustering has been made by PPanGGOLiN.") check_pangenome_info(pangenome, need_annotations=True, need_families=True, disable_bar=disable_bar) gene_families = {} @@ -56,9 +60,12 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: seq_set, _, seq2pan = get_seq2pang(pangenome, sequences, output, new_tmpdir, cpu, no_defrag, identity, coverage) project_partition(seq2pan, seq_set, output) + new_tmpdir.cleanup() - for k, v in seq2pan.items(): - gene_families[v.name] = v + + for pan_family in seq2pan.values(): + gene_families[pan_family.name] = pan_family + fam_2_seq = fam2seq(seq2pan) if families is not None: @@ -66,16 +73,20 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: for fam_name in f.read().splitlines(): gene_families[fam_name] = pangenome.get_gene_family(fam_name) + half_window = round((window_size-1)/2) + logging.info(f'Window size of {half_window*2 + 1}. 
Gene context will include {half_window} genes on each side of the target gene.') + # Compute the graph with transitive closure size provided as parameter start_time = time.time() logging.getLogger().info("Building the graph...") - g = compute_gene_context_graph(families=gene_families, t=transitive, disable_bar=disable_bar) + gene_context_graph = compute_gene_context_graph(families=gene_families, t=transitive, half_window=half_window, disable_bar=disable_bar) logging.getLogger().info( f"Took {round(time.time() - start_time, 2)} seconds to build the graph to find common gene contexts") - logging.getLogger().debug(f"There are {nx.number_of_nodes(g)} nodes and {nx.number_of_edges(g)} edges") - + logging.getLogger().debug(f"There are {nx.number_of_nodes(gene_context_graph)} nodes and {nx.number_of_edges(gene_context_graph)} edges") + + # extract the modules from the graph - common_components = compute_gene_context(g, jaccard) + common_components = compute_gene_context(gene_context_graph, jaccard) families = set() for gene_context in common_components: @@ -89,12 +100,19 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: logging.getLogger().info(f"Computing gene contexts took {round(time.time() - start_time, 2)} seconds") -def compute_gene_context_graph(families: dict, t: int = 4, disable_bar: bool = False) -> nx.Graph: + + # for e, data in gene_context_graph(data=True): + + # nx.write_graphml_lxml(gene_context_graph, os.path.join(output, "context.graphml")) + + +def compute_gene_context_graph(families: dict, t: int = 4, half_window: int = 0, disable_bar: bool = False) -> nx.Graph: """ Construct the graph of gene contexts between families of the pan :param families: Gene families of interest :param t: transitive value + :param half_window: An integer specifying the number of genes to include in the context on each side of the gene of interest. :param disable_bar: Prevents progress bar printing :return: Graph of gene contexts between interesting gene families of the pan @@ -104,7 +122,7 @@ def compute_gene_context_graph(families: dict, t: int = 4, disable_bar: bool = F for family in tqdm(families.values(), unit="families", disable=disable_bar): for gene in family.genes: contig = gene.contig.genes - pos_left, in_context_left, pos_right, in_context_right = extract_gene_context(gene, contig, families, t) + pos_left, in_context_left, pos_right, in_context_right = extract_gene_context(gene, contig, families, t, half_window) if in_context_left or in_context_right: for env_gene in contig[pos_left:pos_right + 1]: _compute_gene_context_graph(g, env_gene, contig, pos_right) @@ -133,29 +151,40 @@ def _compute_gene_context_graph(g: nx.Graph, env_gene: Gene, contig: Contig, pos pos += 1 -def extract_gene_context(gene: Gene, contig: list, families: dict, t: int = 4) -> (int, bool, int, bool): + +def extract_gene_context(gene: Gene, contig: List[Gene], families: Dict[str, str], t: int = 4, half_window: int = 0) -> Tuple[int, bool, int, bool]: """ - Extract gene context and whether said gene context exists + Determine the left and rigth position of the gene context and whether said gene context exists. :param gene: Gene of interest :param contig: list of genes in contig :param families: Alignment results :param t: transitive value + :param half_window: An integer specifying the number of genes to include in the context on each side of the gene of interest. 
:return: Position of the context and if it exists for each side ('left' and 'right') """ - pos_left, pos_right = (max(0, gene.position - t), - min(gene.position + t, len(contig) - 1)) # Gene positions to compare family + search_window = max(t, half_window) + + pos_left, pos_right = (max(0, gene.position - search_window), + min(gene.position + search_window, len(contig) - 1)) # Gene positions to compare family + in_context_left, in_context_right = (False, False) while pos_left < gene.position and not in_context_left: - if contig[pos_left].family in families.values(): + if gene.position - pos_left <= half_window: + # position is in the window + in_context_left = True + + elif contig[pos_left].family in families.values(): in_context_left = True else: pos_left += 1 while pos_right > gene.position and not in_context_right: - if contig[pos_right].family in families.values(): + if pos_right - gene.position <= half_window: + in_context_right = True + elif contig[pos_right].family in families.values(): in_context_right = True else: pos_right -= 1 @@ -243,7 +272,7 @@ def launch(args: argparse.Namespace): pangenome.add_file(args.pangenome) search_gene_context_in_pangenome(pangenome=pangenome, output=args.output, tmpdir=args.tmpdir, sequences=args.sequences, families=args.family, transitive=args.transitive, - identity=args.identity, coverage=args.coverage, jaccard=args.jaccard, + identity=args.identity, coverage=args.coverage, jaccard=args.jaccard, window_size=args.window_size, no_defrag=args.no_defrag, cpu=args.cpu, disable_bar=args.disable_prog_bar) @@ -291,6 +320,9 @@ def parser_context(parser: argparse.ArgumentParser): help="Size of the transitive closure used to build the graph. This indicates the number of " "non related genes allowed in-between two related genes. Increasing it will improve " "precision but lower sensitivity a little.") + optional.add_argument("-w", "--window_size", required=False, type=int, default=1, + help="Number of genes adjacent to a gene of interest to consider in the gene context even if they are non related genes.") + optional.add_argument("-s", "--jaccard", required=False, type=restricted_float, default=0.85, help="minimum jaccard similarity used to filter edges between gene families. Increasing it " "will improve precision but lower sensitivity a lot.") From 1ab7f5d124cd887b6e2eabffa8a3c475ea6a51c5 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 3 May 2023 18:03:07 +0200 Subject: [PATCH 02/25] Add new fct extract_contig_window and tests --- ppanggolin/context/searchGeneContext.py | 37 +++++++++++++++++++++++-- tests/context/test_context.py | 27 ++++++++++++++++++ 2 files changed, 61 insertions(+), 3 deletions(-) create mode 100644 tests/context/test_context.py diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index e1b91282..1c4bdc62 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -7,8 +7,8 @@ import time import logging import os -from typing import List, Dict, Tuple - +from typing import List, Dict, Tuple, Iterable +from itertools import zip_longest # installed libraries from tqdm import tqdm @@ -106,6 +106,37 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: # nx.write_graphml_lxml(gene_context_graph, os.path.join(output, "context.graphml")) + +def extract_contig_window(contig_length: int, positions_of_interest: Iterable[int], window_size: int): + """ + Extracts contiguous windows around positions of interest within a contig. 
+ + :param contig_length: The length of the contig. + :param positions_of_interest: An iterable containing the positions of interest. + :param window_size: The size of the window to extract around each position of interest. + :return: Yields tuples representing the start and end positions of each contiguous window. + """ + + sorted_positions = sorted(positions_of_interest) + + if sorted_positions[0] <0 or sorted_positions[-1] >= contig_length: + raise IndexError(f'Positions of interest are out of range. ' + f"Contig length is {contig_length} while given min={sorted_positions[0]} & max={sorted_positions[-1]} positions") + + start_po = max(sorted_positions[0] - window_size, 0) + + for position, next_po in zip_longest(sorted_positions, sorted_positions[1:]): + if next_po is None: + end_po = min(position + window_size, contig_length-1) + yield start_po, end_po + break + + if position + window_size < next_po - window_size: + end_po = min(position + window_size, contig_length-1) + yield start_po, end_po + start_po = max(next_po - window_size, 0) + + def compute_gene_context_graph(families: dict, t: int = 4, half_window: int = 0, disable_bar: bool = False) -> nx.Graph: """ Construct the graph of gene contexts between families of the pan @@ -321,7 +352,7 @@ def parser_context(parser: argparse.ArgumentParser): "non related genes allowed in-between two related genes. Increasing it will improve " "precision but lower sensitivity a little.") optional.add_argument("-w", "--window_size", required=False, type=int, default=1, - help="Number of genes adjacent to a gene of interest to consider in the gene context even if they are non related genes.") + help="Number of neighboring genes that are considered when searching for conserved genomic contexts around a gene of interest.") optional.add_argument("-s", "--jaccard", required=False, type=restricted_float, default=0.85, help="minimum jaccard similarity used to filter edges between gene families. 
Increasing it " diff --git a/tests/context/test_context.py b/tests/context/test_context.py new file mode 100644 index 00000000..a62b51dd --- /dev/null +++ b/tests/context/test_context.py @@ -0,0 +1,27 @@ +import pytest +from ppanggolin.context.searchGeneContext import extract_contig_window + + +def test_extract_contig_window(): + assert list(extract_contig_window(contig_length=15, positions_of_interest={8}, window_size=1)) == [(7,9)] + + # check that extracted window is inside contig limit + assert list(extract_contig_window(contig_length=16, positions_of_interest={15}, window_size=4)) == [(11,15)] + + assert list(extract_contig_window(contig_length=10, positions_of_interest={2, 8}, window_size=2)) == [(0,4), (6,9)] + + assert list(extract_contig_window(contig_length=10, positions_of_interest={2, 5, 8}, window_size=2)) == [(0,9)] + + # # check that circularity is properly taken into account + # assert list(extract_contig_window(contig_length=11, positions_of_interest={0}, window_size=2, is_circular=True)) == [(9,2)] + + # assert list(extract_contig_window(contig_length=11, positions_of_interest={0, 9}, window_size=2, is_circular=True)) == [(7,2)] + + # assert list(extract_contig_window(contig_length=11, positions_of_interest={0, 9}, window_size=2, is_circular=False)) == [(0,2), (7,11)] + + + with pytest.raises(IndexError): + list(extract_contig_window(contig_length=15, positions_of_interest={15}, window_size=1)) + + with pytest.raises(IndexError): + list(extract_contig_window(contig_length=15, positions_of_interest={-1}, window_size=1)) \ No newline at end of file From 3ee7def3190d3d612574cd4cdca7bde5fe53f344 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Thu, 4 May 2023 13:20:38 +0200 Subject: [PATCH 03/25] add possibility to process circular contig --- ppanggolin/context/searchGeneContext.py | 37 +++++++++++++++++++++---- tests/context/test_context.py | 32 ++++++++++++++------- 2 files changed, 53 insertions(+), 16 deletions(-) diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index 1c4bdc62..762c6137 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -106,35 +106,60 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: # nx.write_graphml_lxml(gene_context_graph, os.path.join(output, "context.graphml")) - -def extract_contig_window(contig_length: int, positions_of_interest: Iterable[int], window_size: int): +def extract_contig_window(contig_length: int, positions_of_interest: Iterable[int], window_size: int, is_circular:bool = False): """ Extracts contiguous windows around positions of interest within a contig. :param contig_length: The length of the contig. :param positions_of_interest: An iterable containing the positions of interest. :param window_size: The size of the window to extract around each position of interest. + :param is_circular: Indicates if the contig is circular. :return: Yields tuples representing the start and end positions of each contiguous window. """ + windows_coordinates = [] + # Sort the positions of interest sorted_positions = sorted(positions_of_interest) + # Check if any position of interest is out of range if sorted_positions[0] <0 or sorted_positions[-1] >= contig_length: raise IndexError(f'Positions of interest are out of range. 
' f"Contig length is {contig_length} while given min={sorted_positions[0]} & max={sorted_positions[-1]} positions") + first_position = sorted_positions[0] + last_position = sorted_positions[-1] + if is_circular: + # in a circular contig, if the window of a gene of interest overlaps the end/start of the contig + # an out of scope position is added to the sorted positions to take into account those positions + # the returned window are always checked that its positions are not out of range... + # so there's no chance to find an out of scope position in final list + if first_position - window_size < 0: + out_of_scope_position = (contig_length ) + first_position + sorted_positions.append(out_of_scope_position) + + if last_position + window_size >= contig_length : + out_of_scope_position = contig_length-1 - (last_position + window_size) + sorted_positions.insert(0, out_of_scope_position) + start_po = max(sorted_positions[0] - window_size, 0) for position, next_po in zip_longest(sorted_positions, sorted_positions[1:]): + if next_po is None: + # If there are no more positions, add the final window end_po = min(position + window_size, contig_length-1) - yield start_po, end_po - break + windows_coordinates.append((start_po, end_po)) - if position + window_size < next_po - window_size: + elif position + window_size +1 < next_po - window_size: + # If there is a gap between positions, add the current window + # and update the start position for the next window end_po = min(position + window_size, contig_length-1) - yield start_po, end_po + + windows_coordinates.append((start_po, end_po)) + start_po = max(next_po - window_size, 0) + + return windows_coordinates def compute_gene_context_graph(families: dict, t: int = 4, half_window: int = 0, disable_bar: bool = False) -> nx.Graph: diff --git a/tests/context/test_context.py b/tests/context/test_context.py index a62b51dd..8597444d 100644 --- a/tests/context/test_context.py +++ b/tests/context/test_context.py @@ -3,25 +3,37 @@ def test_extract_contig_window(): - assert list(extract_contig_window(contig_length=15, positions_of_interest={8}, window_size=1)) == [(7,9)] + assert extract_contig_window(contig_length=15, positions_of_interest={8}, window_size=1) == [(7,9)] # check that extracted window is inside contig limit - assert list(extract_contig_window(contig_length=16, positions_of_interest={15}, window_size=4)) == [(11,15)] + assert extract_contig_window(contig_length=16, positions_of_interest={15}, window_size=4) == [(11,15)] - assert list(extract_contig_window(contig_length=10, positions_of_interest={2, 8}, window_size=2)) == [(0,4), (6,9)] + assert extract_contig_window(contig_length=10, positions_of_interest={2, 8}, window_size=2) == [(0,4), (6,9)] + + # 12 window is (9,15) + # 19 window is (16,22) + # so when 12 and 19 are of interest window merge (9,22) + assert extract_contig_window(contig_length=200, positions_of_interest={12}, window_size=3) == [(9,15)] + assert extract_contig_window(contig_length=200, positions_of_interest={19}, window_size=3) == [(16,22)] + assert extract_contig_window(contig_length=200, positions_of_interest={12, 19}, window_size=3) == [(9,22)] - assert list(extract_contig_window(contig_length=10, positions_of_interest={2, 5, 8}, window_size=2)) == [(0,9)] + assert extract_contig_window(contig_length=10, positions_of_interest={2, 5, 8}, window_size=2) == [(0,9)] +def test_extract_contig_window_with_circular_contig(): # # check that circularity is properly taken into account - # assert 
list(extract_contig_window(contig_length=11, positions_of_interest={0}, window_size=2, is_circular=True)) == [(9,2)] + assert extract_contig_window(contig_length=12, positions_of_interest={1}, window_size=2, is_circular=True) == [(0,3), (11,11)] + assert extract_contig_window(contig_length=12, positions_of_interest={1}, window_size=3, is_circular=True) == [(0,4), (10,11)] + assert extract_contig_window(contig_length=12, positions_of_interest={10}, window_size=3, is_circular=True) == [(0,1), (7,11)] - # assert list(extract_contig_window(contig_length=11, positions_of_interest={0, 9}, window_size=2, is_circular=True)) == [(7,2)] - - # assert list(extract_contig_window(contig_length=11, positions_of_interest={0, 9}, window_size=2, is_circular=False)) == [(0,2), (7,11)] + assert list(extract_contig_window(contig_length=12, positions_of_interest={6}, window_size=6, is_circular=True)) == [(0,11)] + assert list(extract_contig_window(contig_length=12, positions_of_interest={1}, window_size=6, is_circular=True)) == [(0,11)] + assert list(extract_contig_window(contig_length=12, positions_of_interest={1}, window_size=6, is_circular=False)) == [(0,7)] + assert list(extract_contig_window(contig_length=12, positions_of_interest={0, 9}, window_size=2, is_circular=False)) == [(0,2), (7,11)] +def test_extract_contig_window_out_of_range(): with pytest.raises(IndexError): - list(extract_contig_window(contig_length=15, positions_of_interest={15}, window_size=1)) + extract_contig_window(contig_length=15, positions_of_interest={15}, window_size=1) with pytest.raises(IndexError): - list(extract_contig_window(contig_length=15, positions_of_interest={-1}, window_size=1)) \ No newline at end of file + extract_contig_window(contig_length=15, positions_of_interest={-1}, window_size=1) \ No newline at end of file From 2c2e7818402c85e30052da851cce275217825b50 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Thu, 4 May 2023 17:39:46 +0200 Subject: [PATCH 04/25] fix wrong limit computation in extract_contig_window --- ppanggolin/context/searchGeneContext.py | 29 ++++++++++--------- tests/context/test_context.py | 38 +++++++++++-------------- 2 files changed, 33 insertions(+), 34 deletions(-) diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index 762c6137..6755bb35 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -8,7 +8,7 @@ import logging import os from typing import List, Dict, Tuple, Iterable -from itertools import zip_longest +from itertools import zip_longest, chain # installed libraries from tqdm import tqdm @@ -52,8 +52,10 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: "associated to it. 
For now this works only if the clustering has been made by PPanGGOLiN.") check_pangenome_info(pangenome, need_annotations=True, need_families=True, disable_bar=disable_bar) + gene_families = {} fam_2_seq = None + if sequences is not None: # Alignment of sequences on pangenome families new_tmpdir = tempfile.TemporaryDirectory(dir=tmpdir) @@ -106,11 +108,12 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: # nx.write_graphml_lxml(gene_context_graph, os.path.join(output, "context.graphml")) -def extract_contig_window(contig_length: int, positions_of_interest: Iterable[int], window_size: int, is_circular:bool = False): + +def extract_contig_window(contig_size: int, positions_of_interest: Iterable[int], window_size: int, is_circular:bool = False): """ Extracts contiguous windows around positions of interest within a contig. - :param contig_length: The length of the contig. + :param contig_size: Number of genes in contig. :param positions_of_interest: An iterable containing the positions of interest. :param window_size: The size of the window to extract around each position of interest. :param is_circular: Indicates if the contig is circular. @@ -122,23 +125,23 @@ def extract_contig_window(contig_length: int, positions_of_interest: Iterable[in sorted_positions = sorted(positions_of_interest) # Check if any position of interest is out of range - if sorted_positions[0] <0 or sorted_positions[-1] >= contig_length: + if sorted_positions[0] <0 or sorted_positions[-1] >= contig_size: raise IndexError(f'Positions of interest are out of range. ' - f"Contig length is {contig_length} while given min={sorted_positions[0]} & max={sorted_positions[-1]} positions") - - first_position = sorted_positions[0] - last_position = sorted_positions[-1] + f"Contig has {contig_size} genes while given min={sorted_positions[0]} & max={sorted_positions[-1]} positions") + if is_circular: + first_position = sorted_positions[0] + last_position = sorted_positions[-1] # in a circular contig, if the window of a gene of interest overlaps the end/start of the contig # an out of scope position is added to the sorted positions to take into account those positions # the returned window are always checked that its positions are not out of range... 
# so there's no chance to find an out of scope position in final list if first_position - window_size < 0: - out_of_scope_position = (contig_length ) + first_position + out_of_scope_position = (contig_size ) + first_position sorted_positions.append(out_of_scope_position) - if last_position + window_size >= contig_length : - out_of_scope_position = contig_length-1 - (last_position + window_size) + if last_position + window_size >= contig_size : + out_of_scope_position = last_position - contig_size sorted_positions.insert(0, out_of_scope_position) start_po = max(sorted_positions[0] - window_size, 0) @@ -147,13 +150,13 @@ def extract_contig_window(contig_length: int, positions_of_interest: Iterable[in if next_po is None: # If there are no more positions, add the final window - end_po = min(position + window_size, contig_length-1) + end_po = min(position + window_size, contig_size-1) windows_coordinates.append((start_po, end_po)) elif position + window_size +1 < next_po - window_size: # If there is a gap between positions, add the current window # and update the start position for the next window - end_po = min(position + window_size, contig_length-1) + end_po = min(position + window_size, contig_size-1) windows_coordinates.append((start_po, end_po)) diff --git a/tests/context/test_context.py b/tests/context/test_context.py index 8597444d..7e5fff07 100644 --- a/tests/context/test_context.py +++ b/tests/context/test_context.py @@ -1,39 +1,35 @@ import pytest -from ppanggolin.context.searchGeneContext import extract_contig_window +from ppanggolin.context.searchGeneContext import extract_contig_window, get_n_next_genes_index + def test_extract_contig_window(): - assert extract_contig_window(contig_length=15, positions_of_interest={8}, window_size=1) == [(7,9)] + assert extract_contig_window(contig_size=15, positions_of_interest={8}, window_size=1) == [(7,9)] # check that extracted window is inside contig limit - assert extract_contig_window(contig_length=16, positions_of_interest={15}, window_size=4) == [(11,15)] + assert extract_contig_window(contig_size=16, positions_of_interest={15}, window_size=4) == [(11,15)] - assert extract_contig_window(contig_length=10, positions_of_interest={2, 8}, window_size=2) == [(0,4), (6,9)] + assert extract_contig_window(contig_size=10, positions_of_interest={2, 8}, window_size=2) == [(0,4), (6,9)] # 12 window is (9,15) # 19 window is (16,22) # so when 12 and 19 are of interest window merge (9,22) - assert extract_contig_window(contig_length=200, positions_of_interest={12}, window_size=3) == [(9,15)] - assert extract_contig_window(contig_length=200, positions_of_interest={19}, window_size=3) == [(16,22)] - assert extract_contig_window(contig_length=200, positions_of_interest={12, 19}, window_size=3) == [(9,22)] + assert extract_contig_window(contig_size=200, positions_of_interest={12}, window_size=3) == [(9,15)] + assert extract_contig_window(contig_size=200, positions_of_interest={19}, window_size=3) == [(16,22)] + assert extract_contig_window(contig_size=200, positions_of_interest={12, 19}, window_size=3) == [(9,22)] - assert extract_contig_window(contig_length=10, positions_of_interest={2, 5, 8}, window_size=2) == [(0,9)] + assert extract_contig_window(contig_size=10, positions_of_interest={2, 5, 8}, window_size=2) == [(0,9)] def test_extract_contig_window_with_circular_contig(): # # check that circularity is properly taken into account - assert extract_contig_window(contig_length=12, positions_of_interest={1}, window_size=2, is_circular=True) == [(0,3), 
(11,11)] - assert extract_contig_window(contig_length=12, positions_of_interest={1}, window_size=3, is_circular=True) == [(0,4), (10,11)] - assert extract_contig_window(contig_length=12, positions_of_interest={10}, window_size=3, is_circular=True) == [(0,1), (7,11)] - - assert list(extract_contig_window(contig_length=12, positions_of_interest={6}, window_size=6, is_circular=True)) == [(0,11)] - assert list(extract_contig_window(contig_length=12, positions_of_interest={1}, window_size=6, is_circular=True)) == [(0,11)] - assert list(extract_contig_window(contig_length=12, positions_of_interest={1}, window_size=6, is_circular=False)) == [(0,7)] + assert extract_contig_window(contig_size=12, positions_of_interest={1}, window_size=2, is_circular=True) == [(0,3), (11,11)] + assert extract_contig_window(contig_size=12, positions_of_interest={1}, window_size=3, is_circular=True) == [(0,4), (10,11)] + assert extract_contig_window(contig_size=12, positions_of_interest={10}, window_size=3, is_circular=True) == [(0,1), (7,11)] - assert list(extract_contig_window(contig_length=12, positions_of_interest={0, 9}, window_size=2, is_circular=False)) == [(0,2), (7,11)] + assert extract_contig_window(contig_size=12, positions_of_interest={6}, window_size=6, is_circular=True) == [(0,11)] + assert extract_contig_window(contig_size=12, positions_of_interest={1}, window_size=6, is_circular=True) == [(0,11)] + assert extract_contig_window(contig_size=12, positions_of_interest={1}, window_size=6, is_circular=False) == [(0,7)] -def test_extract_contig_window_out_of_range(): - with pytest.raises(IndexError): - extract_contig_window(contig_length=15, positions_of_interest={15}, window_size=1) + assert extract_contig_window(contig_size=12, positions_of_interest={0, 9}, window_size=2, is_circular=False) == [(0,2), (7,11)] - with pytest.raises(IndexError): - extract_contig_window(contig_length=15, positions_of_interest={-1}, window_size=1) \ No newline at end of file + assert extract_contig_window(contig_size=894, positions_of_interest=[151, 152, 153, 893], window_size=4, is_circular=True) == [(0, 3), (147, 157), (889, 893)] From f3a55f2fedce13185127b7d4946a4a0f7e452d37 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Thu, 4 May 2023 17:43:11 +0200 Subject: [PATCH 05/25] add fct and test to get next gene index in a circular aware fashion --- ppanggolin/context/searchGeneContext.py | 15 ++++++++++++++- tests/context/test_context.py | 22 ++++++++++++++++++++++ 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index 6755bb35..d1cbd041 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -107,7 +107,20 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: # nx.write_graphml_lxml(gene_context_graph, os.path.join(output, "context.graphml")) - +def get_n_next_genes_index(current_index:int, next_genes_count:int, contig_size:int, is_circular:bool = False): + # Check if any position of interest is out of range + if current_index >= contig_size: + raise IndexError(f'current gene index is out of range. 
' + f"Contig has {contig_size} genes while the given gene index is {current_index}") + if is_circular: + next_genes = chain(range(current_index+1, contig_size), range(0, current_index)) + else: + next_genes = range(current_index+1, contig_size) + + for i, next_gene_index in enumerate(next_genes): + if i == next_genes_count: + break + yield next_gene_index def extract_contig_window(contig_size: int, positions_of_interest: Iterable[int], window_size: int, is_circular:bool = False): """ diff --git a/tests/context/test_context.py b/tests/context/test_context.py index 7e5fff07..cf6c5fb1 100644 --- a/tests/context/test_context.py +++ b/tests/context/test_context.py @@ -33,3 +33,25 @@ def test_extract_contig_window_with_circular_contig(): assert extract_contig_window(contig_size=12, positions_of_interest={0, 9}, window_size=2, is_circular=False) == [(0,2), (7,11)] assert extract_contig_window(contig_size=894, positions_of_interest=[151, 152, 153, 893], window_size=4, is_circular=True) == [(0, 3), (147, 157), (889, 893)] + +def test_extract_contig_window_out_of_range(): + with pytest.raises(IndexError): + extract_contig_window(contig_size=15, positions_of_interest={15}, window_size=1) + + with pytest.raises(IndexError): + extract_contig_window(contig_size=15, positions_of_interest={-1}, window_size=1) + +def test_get_n_next_genes_index(): + + assert list(get_n_next_genes_index(current_index=6, next_genes_count=3, contig_size=100, is_circular=False)) == [7, 8, 9] + + # there is no next gene because the current index is at the end of a non cicurclar contig + assert list(get_n_next_genes_index(current_index=11, next_genes_count=2, contig_size=12, is_circular=False)) == [] + +def test_get_n_next_genes_index_circular(): + assert list(get_n_next_genes_index(current_index=10, next_genes_count=3, contig_size=12, is_circular=True)) == [11, 0, 1] + assert list(get_n_next_genes_index(current_index=10, next_genes_count=16, contig_size=12, is_circular=True)) == [11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9] + +def test_get_n_next_genes_index_out_of_range(): + with pytest.raises(IndexError): + assert list(get_n_next_genes_index(current_index=10, next_genes_count=16, contig_size=8, is_circular=False)) From cdcc4e1da5245cd225442a6aa99b7af1e182fcd5 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Thu, 4 May 2023 19:47:54 +0200 Subject: [PATCH 06/25] add fct add_edges_to_context_graph with simple test --- ppanggolin/context/searchGeneContext.py | 48 +++++++++++++++++++++++++ tests/context/test_context.py | 47 +++++++++++++++++++++++- 2 files changed, 94 insertions(+), 1 deletion(-) diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index d1cbd041..b645e993 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -106,7 +106,55 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: # for e, data in gene_context_graph(data=True): # nx.write_graphml_lxml(gene_context_graph, os.path.join(output, "context.graphml")) + +def add_edges_to_context_graph(context_graph: nx.Graph, + contig_genes: Iterable[Gene], + contig_windows: List[Tuple[int, int]], + t: int, + is_circular: bool, + disable_bar: bool = False): + print('WINDOWS to search contig in ', contig_windows) + for window_start, window_end in contig_windows: + print('IN WINDOW ', window_start, window_end ) + for gene_index in range(window_start, window_end +1): + print("- CURRENT GENE INDEX", gene_index) + gene = contig_genes[gene_index] + next_genes = 
get_n_next_genes_index(gene_index, next_genes_count=t, + contig_size=len(contig_genes), is_circular=is_circular) + next_genes = list(next_genes) + print('- next gene index:', next_genes) + + for next_gene_index in next_genes: + print('-- current next_gene_index', next_gene_index) + # check that next gene is in contig windows.. + if not any(lower <= next_gene_index <= upper for (lower, upper) in contig_windows): + # next_gene_index is not in any range of genes in the context + # so it is ignore and all folowing genes as well + break + + next_gene = contig_genes[next_gene_index] + if next_gene.family == gene.family: + # if next gene has the same family, the two gene refer to the same node + # so they are ignored.. + print('gene and next gene families are identical.. continue ') + continue + + context_graph.add_edge(gene.family, next_gene.family) + print(f"-- ADD edge between families", gene.family.name, next_gene.family.name) + + try: + context_graph[gene.family][next_gene.family][gene.family].add(gene) + except KeyError: + context_graph[gene.family][next_gene.family][gene.family] = {gene} + try: + context_graph[gene.family][next_gene.family][next_gene.family].add(next_gene) + except KeyError: + context_graph[gene.family][next_gene.family][next_gene.family] = {next_gene} + + + + def get_n_next_genes_index(current_index:int, next_genes_count:int, contig_size:int, is_circular:bool = False): # Check if any position of interest is out of range if current_index >= contig_size: diff --git a/tests/context/test_context.py b/tests/context/test_context.py index cf6c5fb1..fdd435af 100644 --- a/tests/context/test_context.py +++ b/tests/context/test_context.py @@ -1,6 +1,10 @@ import pytest -from ppanggolin.context.searchGeneContext import extract_contig_window, get_n_next_genes_index +from ppanggolin.context.searchGeneContext import extract_contig_window, get_n_next_genes_index, add_edges_to_context_graph +from ppanggolin.geneFamily import GeneFamily +from ppanggolin.genome import Gene, Contig + +import networkx as nx def test_extract_contig_window(): @@ -55,3 +59,44 @@ def test_get_n_next_genes_index_circular(): def test_get_n_next_genes_index_out_of_range(): with pytest.raises(IndexError): assert list(get_n_next_genes_index(current_index=10, next_genes_count=16, contig_size=8, is_circular=False)) + +@pytest.fixture() +def simple_contig(): + + contig = Contig(name="contig1", is_circular=False) + + contig_size=6 + genes = [Gene(i) for i in range(contig_size)] + + for i, (gene, family_name) in enumerate(zip(genes, 'ABCDEFGHIJKLMNOP')): + family = GeneFamily(i, family_name) + gene.fill_annotations(start=0, stop=0, strand=0, position=i) + + contig.add_gene(gene) + family.add_gene(gene) + + return contig + + + +def test_add_edges_to_context_graph(simple_contig): + context_graph = nx.Graph() + + #simple_contig families : ABCDEF + + add_edges_to_context_graph(context_graph, + contig_genes = simple_contig.genes, + contig_windows = [(0,3)], + t=2, + is_circular=simple_contig.is_circular) + + nodes = sorted([n.name for n in context_graph.nodes()]) + edges = {tuple(sorted([n.name, v.name])) for n, v in context_graph.edges()} + + assert nodes == ['A', "B", "C", "D"] + assert edges == {('A', 'B'), + ('A', 'C'), + ('B', 'C'), + ('B', 'D'), + ('C', 'D')} + \ No newline at end of file From d6131ca1cb98df927675907aa4af19adc9da0d19 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Fri, 5 May 2023 10:57:43 +0200 Subject: [PATCH 07/25] add test and clean fct add_edges_to_context_graph --- 
ppanggolin/context/searchGeneContext.py | 89 ++++++++++++++++--------- tests/context/test_context.py | 83 +++++++++++++++++++++++ 2 files changed, 141 insertions(+), 31 deletions(-) diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index b645e993..a98b1062 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -7,7 +7,7 @@ import time import logging import os -from typing import List, Dict, Tuple, Iterable +from typing import List, Dict, Tuple, Iterable, Hashable from itertools import zip_longest, chain # installed libraries @@ -102,57 +102,84 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: logging.getLogger().info(f"Computing gene contexts took {round(time.time() - start_time, 2)} seconds") - - # for e, data in gene_context_graph(data=True): - - # nx.write_graphml_lxml(gene_context_graph, os.path.join(output, "context.graphml")) - def add_edges_to_context_graph(context_graph: nx.Graph, contig_genes: Iterable[Gene], contig_windows: List[Tuple[int, int]], t: int, - is_circular: bool, - disable_bar: bool = False): - - print('WINDOWS to search contig in ', contig_windows) + is_circular: bool): + """ + Add edges to the context graph based on contig genes and windows. + + :param context_graph: The context graph to which edges will be added. + :param contig_genes: An iterable of genes in the contig. + :param contig_windows: A list of tuples representing the start and end positions of contig windows. + :param t: The number of next genes to consider when adding edges. + :param is_circular: A boolean indicating if the contig is circular. + + """ for window_start, window_end in contig_windows: - print('IN WINDOW ', window_start, window_end ) - for gene_index in range(window_start, window_end +1): - print("- CURRENT GENE INDEX", gene_index) + for gene_index in range(window_start, window_end + 1): gene = contig_genes[gene_index] next_genes = get_n_next_genes_index(gene_index, next_genes_count=t, - contig_size=len(contig_genes), is_circular=is_circular) + contig_size=len(contig_genes), is_circular=is_circular) next_genes = list(next_genes) - print('- next gene index:', next_genes) - + for next_gene_index in next_genes: - print('-- current next_gene_index', next_gene_index) - # check that next gene is in contig windows.. + # Check if the next gene is within the contig windows if not any(lower <= next_gene_index <= upper for (lower, upper) in contig_windows): # next_gene_index is not in any range of genes in the context - # so it is ignore and all folowing genes as well + # so it is ignored along with all following genes break next_gene = contig_genes[next_gene_index] if next_gene.family == gene.family: - # if next gene has the same family, the two gene refer to the same node - # so they are ignored.. - print('gene and next gene families are identical.. 
continue ') + # If the next gene has the same family, the two genes refer to the same node + # so they are ignored continue context_graph.add_edge(gene.family, next_gene.family) - print(f"-- ADD edge between families", gene.family.name, next_gene.family.name) + + # Add edge attributes + edge_dict = context_graph[gene.family][next_gene.family] + add_val_to_edge_attribute(edge_dict, gene.family, gene) + add_val_to_edge_attribute(edge_dict, next_gene.family, next_gene) + + add_val_to_edge_attribute(edge_dict, "organisms", gene.organism) + + update_edge_attribute_counter(edge_dict, "gene_pairs") - try: - context_graph[gene.family][next_gene.family][gene.family].add(gene) - except KeyError: - context_graph[gene.family][next_gene.family][gene.family] = {gene} - try: - context_graph[gene.family][next_gene.family][next_gene.family].add(next_gene) - except KeyError: - context_graph[gene.family][next_gene.family][next_gene.family] = {next_gene} + assert gene.organism == next_gene.organism +def add_val_to_edge_attribute(edge_dict: dict, attribute_key, attribute_value): + """ + Add an edge attribute value to the edge dictionary set. + + :param edge_dict: The dictionary containing the edge attributes. + :param attribute_key: The key of the attribute. + :param attribute_value: The value of the attribute to be added. + + """ + + try: + edge_dict[attribute_key].add(attribute_value) + except KeyError: + edge_dict[attribute_key] = {attribute_value} + + +def update_edge_attribute_counter(edge_dict: dict, key:Hashable): + """ + Update the counter for an edge attribute in the edge dictionary. + + :param edge_dict: The dictionary containing the edge attributes. + :param key: The key of the attribute. + + """ + + try: + edge_dict[key] += 1 + except KeyError: + edge_dict[key] = 1 def get_n_next_genes_index(current_index:int, next_genes_count:int, contig_size:int, is_circular:bool = False): diff --git a/tests/context/test_context.py b/tests/context/test_context.py index fdd435af..264cbf8f 100644 --- a/tests/context/test_context.py +++ b/tests/context/test_context.py @@ -77,6 +77,23 @@ def simple_contig(): return contig +@pytest.fixture() +def simple_circular_contig(): + + contig = Contig(name="contig2", is_circular=True) + + contig_size=6 + genes = [Gene(i) for i in range(contig_size)] + + for i, (gene, family_name) in enumerate(zip(genes, 'ABCDEFGHIJKLMNOP')): + family = GeneFamily(i, family_name) + gene.fill_annotations(start=0, stop=0, strand=0, position=i) + + contig.add_gene(gene) + family.add_gene(gene) + + return contig + def test_add_edges_to_context_graph(simple_contig): @@ -99,4 +116,70 @@ def test_add_edges_to_context_graph(simple_contig): ('B', 'C'), ('B', 'D'), ('C', 'D')} + +def test_add_edges_to_context_graph_2(simple_contig): + context_graph = nx.Graph() + + #simple_contig families : A B-C-D E F + + add_edges_to_context_graph(context_graph, + contig_genes = simple_contig.genes, + contig_windows = [(1,3)], + t=1, + is_circular=simple_contig.is_circular) + + nodes = sorted([n.name for n in context_graph.nodes()]) + edges = {tuple(sorted([n.name, v.name])) for n, v in context_graph.edges()} + + assert nodes == ["B", "C", "D"] + assert edges == {('B', 'C'), + ('C', 'D')} + +def test_add_edges_to_context_graph_linear(simple_contig): + + # genes : 1-2-3-4-5-6 + # families : A-B-C-D-E-F + # windows : _____ ___ [(0,2) (4,5)] + + + context_graph = nx.Graph() + + add_edges_to_context_graph(context_graph, + contig_genes = simple_contig.genes, + contig_windows = [(4,5), (0,2)], + t=1, + is_circular=False) + + 
nodes = sorted([n.name for n in context_graph.nodes()]) + edges = {tuple(sorted([n.name, v.name])) for n, v in context_graph.edges()} + + assert nodes == ["A", "B", "C", "E", "F"] + assert edges == {('A', 'B'), + ('B', 'C'), + ('E', "F"), + } + + +def test_add_edges_to_context_graph_circular(simple_contig): + + # genes : 1-2-3-4-5-6 + # families : A-B-C-D-E-F + # windows : _____ ___ [(0,2) (4,5)] + + context_graph = nx.Graph() + + add_edges_to_context_graph(context_graph, + contig_genes = simple_contig.genes, + contig_windows = [(4,5), (0,2)], + t=1, + is_circular=True) + + nodes = sorted([n.name for n in context_graph.nodes()]) + edges = {tuple(sorted([n.name, v.name])) for n, v in context_graph.edges()} + + assert nodes == ["A", "B", "C", "E", "F"] + assert edges == {('A', 'B'), + ('B', 'C'), + ('E', "F"), + ('A', 'F')} # circular so F and A are linked \ No newline at end of file From a0732f57910a6306ebfd9e757fdf1c1ee253d70f Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Fri, 5 May 2023 11:05:02 +0200 Subject: [PATCH 08/25] add docstring and type to function compute_gene_context_graph --- ppanggolin/context/searchGeneContext.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index a98b1062..ac385963 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -7,7 +7,7 @@ import time import logging import os -from typing import List, Dict, Tuple, Iterable, Hashable +from typing import List, Dict, Tuple, Iterable, Hashable, Iterator from itertools import zip_longest, chain # installed libraries @@ -182,8 +182,22 @@ def update_edge_attribute_counter(edge_dict: dict, key:Hashable): edge_dict[key] = 1 -def get_n_next_genes_index(current_index:int, next_genes_count:int, contig_size:int, is_circular:bool = False): - # Check if any position of interest is out of range +def get_n_next_genes_index(current_index: int, next_genes_count: int, contig_size: int, is_circular: bool = False) -> Iterator[int]: + """ + Generate the indices of the next genes based on the current index and contig properties. + + :param current_index: The index of the current gene. + :param next_genes_count: The number of next genes to consider. + :param contig_size: The total number of genes in the contig. + :param is_circular: Flag indicating whether the contig is circular (default: False). + :return: An iterator yielding the indices of the next genes. + + Raises: + - IndexError: If the current index is out of range for the given contig size. + + """ + + # Check if the current index is out of range if current_index >= contig_size: raise IndexError(f'current gene index is out of range. 
' f"Contig has {contig_size} genes while the given gene index is {current_index}") From 4725e9fcf06294e8e86880362ac8bc808f09be32 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Fri, 5 May 2023 11:51:09 +0200 Subject: [PATCH 09/25] add function compute_gene_context_graph and test --- ppanggolin/context/searchGeneContext.py | 123 +++++++++--------------- tests/context/test_context.py | 33 ++++++- 2 files changed, 77 insertions(+), 79 deletions(-) diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index ac385963..f2f4b0d7 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -7,8 +7,9 @@ import time import logging import os -from typing import List, Dict, Tuple, Iterable, Hashable, Iterator +from typing import List, Dict, Tuple, Iterable, Hashable, Iterator, Set from itertools import zip_longest, chain +from collections import defaultdict # installed libraries from tqdm import tqdm @@ -18,10 +19,11 @@ # local libraries from ppanggolin.formats import check_pangenome_info from ppanggolin.genome import Gene, Contig -from ppanggolin.utils import mk_outdir, restricted_float, add_gene, connected_components +from ppanggolin.utils import mk_outdir, restricted_float, connected_components from ppanggolin.pangenome import Pangenome from ppanggolin.align.alignOnPang import get_seq2pang, project_partition from ppanggolin.region import GeneContext +from ppanggolin.geneFamily import GeneFamily def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: str, sequences: str = None, @@ -80,10 +82,14 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: # Compute the graph with transitive closure size provided as parameter start_time = time.time() + logging.getLogger().info("Building the graph...") + gene_context_graph = compute_gene_context_graph(families=gene_families, t=transitive, half_window=half_window, disable_bar=disable_bar) + logging.getLogger().info( f"Took {round(time.time() - start_time, 2)} seconds to build the graph to find common gene contexts") + logging.getLogger().debug(f"There are {nx.number_of_nodes(gene_context_graph)} nodes and {nx.number_of_edges(gene_context_graph)} edges") @@ -266,91 +272,55 @@ def extract_contig_window(contig_size: int, positions_of_interest: Iterable[int] return windows_coordinates - -def compute_gene_context_graph(families: dict, t: int = 4, half_window: int = 0, disable_bar: bool = False) -> nx.Graph: +def get_contig_to_genes(gene_families: Iterable[GeneFamily]) -> Dict[Contig, Set[Gene]]: """ - Construct the graph of gene contexts between families of the pan - - :param families: Gene families of interest - :param t: transitive value - :param half_window: An integer specifying the number of genes to include in the context on each side of the gene of interest. - :param disable_bar: Prevents progress bar printing + Group genes from specified gene families by contig. 
- :return: Graph of gene contexts between interesting gene families of the pan - """ - - g = nx.Graph() - for family in tqdm(families.values(), unit="families", disable=disable_bar): - for gene in family.genes: - contig = gene.contig.genes - pos_left, in_context_left, pos_right, in_context_right = extract_gene_context(gene, contig, families, t, half_window) - if in_context_left or in_context_right: - for env_gene in contig[pos_left:pos_right + 1]: - _compute_gene_context_graph(g, env_gene, contig, pos_right) - return g - - -def _compute_gene_context_graph(g: nx.Graph, env_gene: Gene, contig: Contig, pos_r: int): - """ - Compute graph of gene contexts between one gene and the other part of the contig - - :param: Graph of gene contexts between interesting gene families of the pan - :param env_gene: Gene of the current position - :param contig: Current contig to search a gene context - :param pos_r: Gene to search a gene context + :param gene_families: An iterable of gene families object. + + :return: A dictionary mapping contigs to sets of genes. """ - - g.add_node(env_gene.family) - add_gene(g.nodes[env_gene.family], env_gene, fam_split=False) - pos = env_gene.position + 1 - while pos <= pos_r: - if env_gene.family != contig[pos].family: - g.add_edge(env_gene.family, contig[pos].family) - edge = g[env_gene.family][contig[pos].family] - add_gene(edge, env_gene) - add_gene(edge, contig[pos]) - pos += 1 - + + contig_to_genes_of_interest = defaultdict(set) + for gene_family in gene_families: + for gene in gene_family.genes: + contig = gene.contig + contig_to_genes_of_interest[contig].add(gene) + return contig_to_genes_of_interest -def extract_gene_context(gene: Gene, contig: List[Gene], families: Dict[str, str], t: int = 4, half_window: int = 0) -> Tuple[int, bool, int, bool]: +def compute_gene_context_graph(families: Iterable[GeneFamily], transitive: int = 4, window_size: int = 0, disable_bar: bool = False) -> nx.Graph: """ - Determine the left and rigth position of the gene context and whether said gene context exists. + Construct the graph of gene contexts between families of the pangenome. - :param gene: Gene of interest - :param contig: list of genes in contig - :param families: Alignment results - :param t: transitive value - :param half_window: An integer specifying the number of genes to include in the context on each side of the gene of interest. + :param families: An iterable of gene families. + :param transitive: Size of the transitive closure used to build the graph. + :param window_size: Size of the window for extracting gene contexts (default: 0). + :param disable_bar: Flag to disable the progress bar (default: False). + :return: The constructed gene context graph. 
- :return: Position of the context and if it exists for each side ('left' and 'right') """ - search_window = max(t, half_window) - - pos_left, pos_right = (max(0, gene.position - search_window), - min(gene.position + search_window, len(contig) - 1)) # Gene positions to compare family + context_graph = nx.Graph() - in_context_left, in_context_right = (False, False) - while pos_left < gene.position and not in_context_left: - if gene.position - pos_left <= half_window: - # position is in the window - in_context_left = True - - elif contig[pos_left].family in families.values(): - in_context_left = True - else: - pos_left += 1 - - while pos_right > gene.position and not in_context_right: - if pos_right - gene.position <= half_window: - in_context_right = True - elif contig[pos_right].family in families.values(): - in_context_right = True - else: - pos_right -= 1 + contig_to_genes_of_interest = get_contig_to_genes(families) + + for contig, genes_of_interest in tqdm(contig_to_genes_of_interest.items(), unit="contig", total=len(contig_to_genes_of_interest), disable=disable_bar): + logging.debug(f'Processing {len(genes_of_interest)} genes of interest in contig {contig}') + + genes_count = len(contig.genes) + + genes_of_interest_positions = [g.position for g in genes_of_interest] - return pos_left, in_context_left, pos_right, in_context_right + contig_windows = extract_contig_window(genes_count, genes_of_interest_positions, + window_size=window_size, is_circular=contig.is_circular) + + add_edges_to_context_graph(context_graph, + contig.genes, + contig_windows, + transitive, + contig.is_circular) + return context_graph def compute_gene_context(g: nx.Graph, jaccard: float = 0.85) -> set: @@ -391,7 +361,8 @@ def fam2seq(seq_to_pan: dict) -> dict: def export_to_dataframe(families: set, gene_contexts: set, fam_to_seq: dict, output: str): - """ Export the results into dataFrame + """ + Export the results into dataFrame :param families: Families related to the connected components :param gene_contexts: connected components found in the pan diff --git a/tests/context/test_context.py b/tests/context/test_context.py index 264cbf8f..1fd45d62 100644 --- a/tests/context/test_context.py +++ b/tests/context/test_context.py @@ -1,5 +1,5 @@ import pytest -from ppanggolin.context.searchGeneContext import extract_contig_window, get_n_next_genes_index, add_edges_to_context_graph +from ppanggolin.context.searchGeneContext import extract_contig_window, get_n_next_genes_index, add_edges_to_context_graph, compute_gene_context_graph from ppanggolin.geneFamily import GeneFamily from ppanggolin.genome import Gene, Contig @@ -71,7 +71,9 @@ def simple_contig(): for i, (gene, family_name) in enumerate(zip(genes, 'ABCDEFGHIJKLMNOP')): family = GeneFamily(i, family_name) gene.fill_annotations(start=0, stop=0, strand=0, position=i) - + + gene.fill_parents("organism A", contig) + contig.add_gene(gene) family.add_gene(gene) @@ -182,4 +184,29 @@ def test_add_edges_to_context_graph_circular(simple_contig): ('B', 'C'), ('E', "F"), ('A', 'F')} # circular so F and A are linked - \ No newline at end of file + + +def test_compute_gene_context_graph(simple_contig): + + # genes : 0-1-2-3-4-5 + # families : A-B-C-D-E-F + # family of interest : ^ + # windows of 2 : ___ ___ + + # simple case with only one contig with 6 genes and 6 families + + families_in_contigs = [g.family for g in simple_contig.genes ] + family_names_of_interest = ["C"] + families_of_interest = {f for f in families_in_contigs if f.name in family_names_of_interest } + + 
context_graph = compute_gene_context_graph(families_of_interest, + transitive=1, + window_size = 2) + nodes = sorted([n.name for n in context_graph.nodes()]) + edges = {tuple(sorted([n.name, v.name])) for n, v in context_graph.edges()} + + assert nodes == ["A", "B", "C", "D", "E"] + assert edges == {('A', 'B'), + ('B', 'C'), + ('C', "D"), + ('D', 'E')} \ No newline at end of file From bb5ecbd8f022034f86253c7294f82902a09b3c37 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 9 May 2023 09:59:45 +0200 Subject: [PATCH 10/25] write graph to investigate found contexts --- ppanggolin/context/searchGeneContext.py | 98 +++++++++++++++++++------ 1 file changed, 77 insertions(+), 21 deletions(-) diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index f2f4b0d7..110f88d2 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -28,7 +28,7 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: str, sequences: str = None, families: str = None, transitive: int = 4, identity: float = 0.5, - coverage: float = 0.8, jaccard: float = 0.85, window_size: int = 1, no_defrag: bool = False, + coverage: float = 0.8, jaccard_threshold: float = 0.85, window_size: int = 1, no_defrag: bool = False, cpu: int = 1, disable_bar=True): """ Main function to search common gene contexts between sequence set and pangenome families @@ -41,7 +41,7 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: :param transitive: number of genes to check on both sides of a family aligned with an input sequence :param identity: minimum identity threshold between sequences and gene families for the alignment :param coverage: minimum coverage threshold between sequences and gene families for the alignment - :param jaccard: Jaccard index to filter edges in graph + :param jaccard_threshold: Jaccard index threshold to filter edges in graph :param window_size: Number of genes to consider in the gene context. :param no_defrag: do not use the defrag workflow if true :param cpu: Number of core used to process @@ -55,7 +55,7 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: check_pangenome_info(pangenome, need_annotations=True, need_families=True, disable_bar=disable_bar) - gene_families = {} + gene_families = set() fam_2_seq = None if sequences is not None: @@ -68,45 +68,100 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: new_tmpdir.cleanup() for pan_family in seq2pan.values(): - gene_families[pan_family.name] = pan_family + gene_families.add(pan_family) fam_2_seq = fam2seq(seq2pan) if families is not None: with open(families, 'r') as f: for fam_name in f.read().splitlines(): - gene_families[fam_name] = pangenome.get_gene_family(fam_name) + gene_families.add(pangenome.get_gene_family(fam_name)) - half_window = round((window_size-1)/2) - logging.info(f'Window size of {half_window*2 + 1}. Gene context will include {half_window} genes on each side of the target gene.') + # half_window = round((window_size-1)/2) + # logging.info(f'Window size of {half_window*2 + 1}. 
Gene context will include {half_window} genes on each side of the target gene.') # Compute the graph with transitive closure size provided as parameter start_time = time.time() logging.getLogger().info("Building the graph...") - gene_context_graph = compute_gene_context_graph(families=gene_families, t=transitive, half_window=half_window, disable_bar=disable_bar) + gene_context_graph = compute_gene_context_graph(families=gene_families, transitive=transitive, window_size=window_size, disable_bar=disable_bar) logging.getLogger().info( f"Took {round(time.time() - start_time, 2)} seconds to build the graph to find common gene contexts") logging.getLogger().debug(f"There are {nx.number_of_nodes(gene_context_graph)} nodes and {nx.number_of_edges(gene_context_graph)} edges") - + + compute_edge_metrics(gene_context_graph, jaccard_threshold) + + + write_graph(gene_context_graph, output, gene_families) # extract the modules from the graph - common_components = compute_gene_context(gene_context_graph, jaccard) + # common_components = compute_gene_context(gene_context_graph, jaccard) - families = set() - for gene_context in common_components: - families |= gene_context.families + # families = set() + # for gene_context in common_components: + # families |= gene_context.families - if len(families) != 0: - export_to_dataframe(families, common_components, fam_2_seq, output) - else: - logging.getLogger().info(f"No gene contexts were found") + # if len(families) != 0: + # export_to_dataframe(families, common_components, fam_2_seq, output) + # else: + # logging.getLogger().info(f"No gene contexts were found") logging.getLogger().info(f"Computing gene contexts took {round(time.time() - start_time, 2)} seconds") +def write_graph(context_graph:nx.Graph, output_dir:str, famillies_of_interest): + + def filter_edge_attribute(data): + return {k:v for k, v in data.items() if type(v) != set} + + G = nx.Graph() + + G.add_edges_from(((f1.name, f2.name) for f1,f2 in context_graph.edges())) + + edges_with_attributes = {(f1.name, f2.name):filter_edge_attribute(d) for f1,f2,d in context_graph.edges(data=True)} + + nx.set_edge_attributes(G, edges_with_attributes) + + nodes_data = {f.name:{"organisms":len(f.organisms), "genes":len(f.genes), "famillies_of_interest": f in famillies_of_interest} for f in context_graph.nodes()} + + for f, d in G.nodes(data=True): + d.update(nodes_data[f]) + + nx.write_graphml_lxml(G, os.path.join(output_dir, "graph_context.graphml")) + +def compute_edge_metrics(context_graph: nx.Graph, gene_proportion_cutoff:float ): + # compute jaccard on organism + for f1, f2, data in context_graph.edges(data=True): + + data['jaccard_organism'] = len(data['organisms'])/len(f1.organisms | f2.organisms) + + data['min_jaccard_organism'] = len(data['organisms'])/min(len(f1.organisms), len(f2.organisms)) + data['max_jaccard_organism'] = len(data['organisms'])/max(len(f1.organisms), len(f2.organisms)) + + + # print("f1", f1) + # print("len data[f1]", len(data[f1])) + + # print('len(f1.genes)', len(f1.genes)) + + f1_gene_proportion = len(data[f1])/len(f1.genes) + f2_gene_proportion = len(data[f2])/len(f2.genes) + + data[f'f1'] = f1.name + data[f'f2'] = f2.name + data[f'f1_gene_proportion'] = f1_gene_proportion + data[f'f2_gene_proportion'] = f2_gene_proportion + + data[f'is_gene_proportion_>_{gene_proportion_cutoff}'] = (f1_gene_proportion >= gene_proportion_cutoff) and (f2_gene_proportion >= gene_proportion_cutoff) + + # for k, v in data.items(): + # if type(v) != set: + # print(k, v) + # print('===') + + def 
add_edges_to_context_graph(context_graph: nx.Graph, contig_genes: Iterable[Gene], @@ -297,8 +352,8 @@ def compute_gene_context_graph(families: Iterable[GeneFamily], transitive: int = :param transitive: Size of the transitive closure used to build the graph. :param window_size: Size of the window for extracting gene contexts (default: 0). :param disable_bar: Flag to disable the progress bar (default: False). - :return: The constructed gene context graph. + :return: The constructed gene context graph. """ context_graph = nx.Graph() @@ -404,7 +459,7 @@ def launch(args: argparse.Namespace): pangenome.add_file(args.pangenome) search_gene_context_in_pangenome(pangenome=pangenome, output=args.output, tmpdir=args.tmpdir, sequences=args.sequences, families=args.family, transitive=args.transitive, - identity=args.identity, coverage=args.coverage, jaccard=args.jaccard, window_size=args.window_size, + identity=args.identity, coverage=args.coverage, jaccard_threshold=args.jaccard, window_size=args.window_size, no_defrag=args.no_defrag, cpu=args.cpu, disable_bar=args.disable_prog_bar) @@ -452,8 +507,9 @@ def parser_context(parser: argparse.ArgumentParser): help="Size of the transitive closure used to build the graph. This indicates the number of " "non related genes allowed in-between two related genes. Increasing it will improve " "precision but lower sensitivity a little.") - optional.add_argument("-w", "--window_size", required=False, type=int, default=1, - help="Number of neighboring genes that are considered when searching for conserved genomic contexts around a gene of interest.") + optional.add_argument("-w", "--window_size", required=False, type=int, default=5, + help="Number of neighboring genes that are considered on each side of " + "a gene of interest when searching for conserved genomic contexts.") optional.add_argument("-s", "--jaccard", required=False, type=restricted_float, default=0.85, help="minimum jaccard similarity used to filter edges between gene families. Increasing it " From a6bb109c71679a139dab5101499d1ee3e8da65e7 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Fri, 12 May 2023 17:27:00 +0200 Subject: [PATCH 11/25] filter and output contexts in tsv and in graph --- ppanggolin/context/searchGeneContext.py | 281 ++++++++++++++++-------- 1 file changed, 192 insertions(+), 89 deletions(-) diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index 110f88d2..35470750 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -29,7 +29,7 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: str, sequences: str = None, families: str = None, transitive: int = 4, identity: float = 0.5, coverage: float = 0.8, jaccard_threshold: float = 0.85, window_size: int = 1, no_defrag: bool = False, - cpu: int = 1, disable_bar=True): + cpu: int = 1, write_context_graph:bool = False, disable_bar=True): """ Main function to search common gene contexts between sequence set and pangenome families @@ -45,6 +45,7 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: :param window_size: Number of genes to consider in the gene context. 
:param no_defrag: do not use the defrag workflow if true :param cpu: Number of core used to process + :param write_context_graph: Write graph of the contexts :param disable_bar: Allow preventing bar progress print """ @@ -90,77 +91,159 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: logging.getLogger().info( f"Took {round(time.time() - start_time, 2)} seconds to build the graph to find common gene contexts") - logging.getLogger().debug(f"There are {nx.number_of_nodes(gene_context_graph)} nodes and {nx.number_of_edges(gene_context_graph)} edges") + logging.getLogger().debug(f"Context graph made of {nx.number_of_nodes(gene_context_graph)} nodes and {nx.number_of_edges(gene_context_graph)} edges") compute_edge_metrics(gene_context_graph, jaccard_threshold) + # Filter graph + filter_flag = f'is_jaccard_gene_>_{jaccard_threshold}' + + filtered_graph = nx.subgraph_view(gene_context_graph, filter_edge=lambda n1, n2: gene_context_graph[n1][n2][filter_flag] ) + + logging.getLogger().debug(f"Filtering context graph on {filter_flag}") + logging.getLogger().debug(f"Context graph made of {nx.number_of_nodes(filtered_graph)} nodes and {nx.number_of_edges(filtered_graph)} edges") + connected_components = nx.connected_components(filtered_graph) + + # Connected component graph Filtering + + # remove singleton famillies + connected_components = (component for component in connected_components if len(component) > 1) - write_graph(gene_context_graph, output, gene_families) + # remove component made only of famillies not initially requested + connected_components = (component for component in connected_components if component & gene_families) - # extract the modules from the graph - # common_components = compute_gene_context(gene_context_graph, jaccard) + gene_contexts = {GeneContext(gc_id=i, families=component) for i, component in enumerate(connected_components) } - # families = set() - # for gene_context in common_components: - # families |= gene_context.families + families_in_contexts = {family for gene_context in gene_contexts for family in gene_context.families} + + graph_with_final_contexts = nx.subgraph_view(gene_context_graph, filter_node=lambda n: n in families_in_contexts) + + if write_context_graph: + write_graph(graph_with_final_contexts, output, gene_families, gene_contexts) + + if len(families_in_contexts) != 0: + logging.getLogger().debug(f"There are {len(families_in_contexts)} families among {len(gene_contexts)} gene contexts") + + output_file = os.path.join(output, "gene_contexts.tsv") - # if len(families) != 0: - # export_to_dataframe(families, common_components, fam_2_seq, output) - # else: - # logging.getLogger().info(f"No gene contexts were found") + export_context_to_dataframe(gene_contexts, fam_2_seq, output_file) + + else: + logging.getLogger().info(f"No gene contexts were found") logging.getLogger().info(f"Computing gene contexts took {round(time.time() - start_time, 2)} seconds") -def write_graph(context_graph:nx.Graph, output_dir:str, famillies_of_interest): - def filter_edge_attribute(data): - return {k:v for k, v in data.items() if type(v) != set} - + # # Finding connected components with panmodule functions + # # extract the modules from the graph + + # logging.getLogger().debug(f"panmodule style:") + # gene_contexts_pandmodule_way = compute_gene_context(gene_context_graph, jaccard_threshold) + + # # remove singleton famillies + # gene_contexts_pandmodule_way = (context for context in gene_contexts_pandmodule_way if len(context.families) > 1 ) + 
+ # # remove component made only of famillies not initially requested + # gene_contexts_pandmodule_way = [context for context in gene_contexts_pandmodule_way if context.families & gene_families] + + # families_in_contexts = {family for gene_context in gene_contexts_pandmodule_way for family in gene_context.families} + + # logging.getLogger().debug(f"There are {len(families_in_contexts)} families among {len(gene_contexts_pandmodule_way)} gene contexts") + + + # output_file = os.path.join(output, "gene_contexts_panmodule_style.tsv") + # export_context_to_dataframe(gene_contexts_pandmodule_way, fam_2_seq, output_file) + + + +def write_graph(context_graph: nx.Graph, output_dir: str, famillies_of_interest: Set[GeneFamily], gene_contexts:List[GeneContext]): + """ + Write a graph to file with node and edge attributes. + + This function writes a graph to a file in the GraphML format or in GEXF format. The original context graph contains + ppanggolin objects as nodes and lists and dictionaries in edge attributes. Since these objects + cannot be written to the output graph, this function creates a new graph that contains only + writable objects. + + :param context_graph: A NetworkX Graph object representing the graph. + :param output_dir: The output directory where the graph file will be written. + :param famillies_of_interest: A list of node objects that are of interest. + :param gene_contexts: List of gene context, used to add context id to node of the graph + + """ + def filter_attribute(data:dict): + """ + Helper function to filter the edge attributes. + + :param data: The edge attribute data. + :return: A filtered dictionary containing only non-collection attributes. + """ + return {k:v for k, v in data.items() if type(v) not in [set, dict, list]} + G = nx.Graph() G.add_edges_from(((f1.name, f2.name) for f1,f2 in context_graph.edges())) - edges_with_attributes = {(f1.name, f2.name):filter_edge_attribute(d) for f1,f2,d in context_graph.edges(data=True)} + edges_with_attributes = {(f1.name, f2.name):filter_attribute(d) for f1,f2,d in context_graph.edges(data=True)} nx.set_edge_attributes(G, edges_with_attributes) - - nodes_data = {f.name:{"organisms":len(f.organisms), "genes":len(f.genes), "famillies_of_interest": f in famillies_of_interest} for f in context_graph.nodes()} + nodes_attributes_filtered = {f.name:filter_attribute(d) for f,d in context_graph.nodes(data=True)} + + # on top of attributes already contained in node of context graph + # add organisms and genes count that have the family, the partition and if the family was in initially requested + nodes_family_data = {f.name:{"organisms":len(f.organisms), + "partition":f.partition, + "genes":len(f.genes), + "famillies_of_interest": f in famillies_of_interest} for f in context_graph.nodes()} + + family_name_to_context_id = {family.name:context.ID for context in gene_contexts for family in context.families} + for f, d in G.nodes(data=True): - d.update(nodes_data[f]) + d.update(nodes_family_data[f]) + d.update(nodes_attributes_filtered[f]) + d['context_id'] = family_name_to_context_id[f] + + + graphml_file = os.path.join(output_dir, "graph_context.graphml") + logging.info(f'Writting context graph in {graphml_file}') + nx.write_graphml_lxml(G, graphml_file) + + gexf_file = os.path.join(output_dir, "graph_context.gexf") + logging.info(f'Writting context graph in {gexf_file}') + nx.readwrite.gexf.write_gexf(G, gexf_file) - nx.write_graphml_lxml(G, os.path.join(output_dir, "graph_context.graphml")) -def compute_edge_metrics(context_graph: 
nx.Graph, gene_proportion_cutoff:float ): - # compute jaccard on organism +def compute_edge_metrics(context_graph: nx.Graph, gene_proportion_cutoff: float) -> None: + """ + Compute various metrics on the edges of the context graph. + + :param context_graph: The context graph. + :param gene_proportion_cutoff: The minimum proportion of shared genes between two features for their edge to be considered significant. + """ + # compute jaccard on organism and on genes for f1, f2, data in context_graph.edges(data=True): data['jaccard_organism'] = len(data['organisms'])/len(f1.organisms | f2.organisms) - data['min_jaccard_organism'] = len(data['organisms'])/min(len(f1.organisms), len(f2.organisms)) - data['max_jaccard_organism'] = len(data['organisms'])/max(len(f1.organisms), len(f2.organisms)) - - - # print("f1", f1) - # print("len data[f1]", len(data[f1])) - - # print('len(f1.genes)', len(f1.genes)) - - f1_gene_proportion = len(data[f1])/len(f1.genes) - f2_gene_proportion = len(data[f2])/len(f2.genes) + f1_gene_proportion = len(data['genes'][f1])/len(f1.genes) + f2_gene_proportion = len(data['genes'][f2])/len(f2.genes) data[f'f1'] = f1.name data[f'f2'] = f2.name - data[f'f1_gene_proportion'] = f1_gene_proportion - data[f'f2_gene_proportion'] = f2_gene_proportion + data[f'f1_jaccard_gene'] = f1_gene_proportion + data[f'f2_jaccard_gene'] = f2_gene_proportion - data[f'is_gene_proportion_>_{gene_proportion_cutoff}'] = (f1_gene_proportion >= gene_proportion_cutoff) and (f2_gene_proportion >= gene_proportion_cutoff) + data[f'is_jaccard_gene_>_{gene_proportion_cutoff}'] = (f1_gene_proportion >= gene_proportion_cutoff) and (f2_gene_proportion >= gene_proportion_cutoff) - # for k, v in data.items(): - # if type(v) != set: - # print(k, v) - # print('===') - + # the following commented out lines are additional metrics that could be used + + # data['min_jaccard_organism'] = len(data['organisms'])/min(len(f1.organisms), len(f2.organisms)) + # data['max_jaccard_organism'] = len(data['organisms'])/max(len(f1.organisms), len(f2.organisms)) + # f1_gene_proportion_partial = len(data['genes'][f1])/len(context_graph.nodes[f1]['genes']) + # f2_gene_proportion_partial = len(data['genes'][f2])/len(context_graph.nodes[f2]['genes']) + # data[f'f1_jaccard_gene_partital'] = f1_gene_proportion_partial + # data[f'f2_jaccard_gene_partital'] = f2_gene_proportion_partial def add_edges_to_context_graph(context_graph: nx.Graph, @@ -185,7 +268,7 @@ def add_edges_to_context_graph(context_graph: nx.Graph, contig_size=len(contig_genes), is_circular=is_circular) next_genes = list(next_genes) - for next_gene_index in next_genes: + for i, next_gene_index in enumerate(next_genes): # Check if the next gene is within the contig windows if not any(lower <= next_gene_index <= upper for (lower, upper) in contig_windows): # next_gene_index is not in any range of genes in the context @@ -200,39 +283,60 @@ def add_edges_to_context_graph(context_graph: nx.Graph, context_graph.add_edge(gene.family, next_gene.family) + if i == 0: + context_graph[gene.family][next_gene.family]['adjacent_family'] = True + + + # Add node attributes + node_gene_dict = context_graph.nodes[gene.family] + next_gene_gene_dict = context_graph.nodes[next_gene.family] + + increment_attribute_counter(node_gene_dict, "genes_count") + increment_attribute_counter(next_gene_gene_dict, "genes_count") + + add_val_to_dict_attribute(node_gene_dict, "genes", gene) + add_val_to_dict_attribute(next_gene_gene_dict, "genes", next_gene) + + # Add edge attributes edge_dict = 
context_graph[gene.family][next_gene.family] - add_val_to_edge_attribute(edge_dict, gene.family, gene) - add_val_to_edge_attribute(edge_dict, next_gene.family, next_gene) + try: + genes_edge_dict = edge_dict['genes'] + except: + genes_edge_dict = {} + edge_dict['genes'] = genes_edge_dict + + add_val_to_dict_attribute(genes_edge_dict, gene.family, gene) + add_val_to_dict_attribute(genes_edge_dict, next_gene.family, next_gene) - add_val_to_edge_attribute(edge_dict, "organisms", gene.organism) + add_val_to_dict_attribute(edge_dict, "organisms", gene.organism) - update_edge_attribute_counter(edge_dict, "gene_pairs") + increment_attribute_counter(edge_dict, "gene_pairs") - assert gene.organism == next_gene.organism + assert gene.organism == next_gene.organism -def add_val_to_edge_attribute(edge_dict: dict, attribute_key, attribute_value): +def add_val_to_dict_attribute(attr_dict: dict, attribute_key, attribute_value): """ - Add an edge attribute value to the edge dictionary set. + Add an attribute value to a edge or node dictionary set. - :param edge_dict: The dictionary containing the edge attributes. + :param attr_dict: The dictionary containing the edge/node attributes. :param attribute_key: The key of the attribute. :param attribute_value: The value of the attribute to be added. """ try: - edge_dict[attribute_key].add(attribute_value) + attr_dict[attribute_key].add(attribute_value) except KeyError: - edge_dict[attribute_key] = {attribute_value} + attr_dict[attribute_key] = {attribute_value} -def update_edge_attribute_counter(edge_dict: dict, key:Hashable): +def increment_attribute_counter(edge_dict: dict, key:Hashable): """ - Update the counter for an edge attribute in the edge dictionary. + Increment the counter for an edge/node attribute in the edge/node dictionary. - :param edge_dict: The dictionary containing the edge attributes. + :param edge_dict: The dictionary containing the attributes. :param key: The key of the attribute. 
""" @@ -361,7 +465,6 @@ def compute_gene_context_graph(families: Iterable[GeneFamily], transitive: int = contig_to_genes_of_interest = get_contig_to_genes(families) for contig, genes_of_interest in tqdm(contig_to_genes_of_interest.items(), unit="contig", total=len(contig_to_genes_of_interest), disable=disable_bar): - logging.debug(f'Processing {len(genes_of_interest)} genes of interest in contig {contig}') genes_count = len(contig.genes) @@ -378,22 +481,22 @@ def compute_gene_context_graph(families: Iterable[GeneFamily], transitive: int = return context_graph -def compute_gene_context(g: nx.Graph, jaccard: float = 0.85) -> set: - """ - Compute the gene contexts in the graph +# def compute_gene_context(g: nx.Graph, jaccard: float = 0.85) -> set: +# """ +# Compute the gene contexts in the graph - :param g: Graph of gene contexts between interesting gene families of the pan - :param jaccard: Jaccard index +# :param g: Graph of gene contexts between interesting gene families of the pan +# :param jaccard: Jaccard index - :return: Set of gene contexts find in graph - """ +# :return: Set of gene contexts find in graph +# """ - gene_contexts = set() - c = 1 - for comp in connected_components(g, removed=set(), weight=jaccard): - gene_contexts.add(GeneContext(gc_id=c, families=comp)) - c += 1 - return gene_contexts +# gene_contexts = set() +# c = 1 +# for comp in connected_components(g, removed=set(), weight=jaccard): +# gene_contexts.add(GeneContext(gc_id=c, families=comp)) +# c += 1 +# return gene_contexts def fam2seq(seq_to_pan: dict) -> dict: @@ -415,34 +518,33 @@ def fam2seq(seq_to_pan: dict) -> dict: return fam_2_seq -def export_to_dataframe(families: set, gene_contexts: set, fam_to_seq: dict, output: str): +def export_context_to_dataframe(gene_contexts: set, fam_to_seq: dict, output: str): """ Export the results into dataFrame - :param families: Families related to the connected components :param gene_contexts: connected components found in the pan :param fam_to_seq: Dictionary with gene families as keys and list of sequence ids as values :param output: output path """ - logging.getLogger().debug(f"There are {len(families)} families among {len(gene_contexts)} gene contexts") - lines = [] for gene_context in gene_contexts: for family in gene_context.families: - line = [gene_context.ID] - if fam_to_seq is None or fam_to_seq.get(family.ID) is None: - line += [family.name, None, len(family.organisms), family.named_partition] - else: - line += [family.name, ','.join(fam_to_seq.get(family.ID)), - len(family.organisms), family.named_partition] - lines.append(line) - df = pd.DataFrame(lines, - columns=["GeneContext ID", "Gene family name", "Sequence ID", "Nb Genomes", "Partition"] - ).set_index("GeneContext ID") - df.sort_values(["GeneContext ID", "Sequence ID"], na_position='last').to_csv( - path_or_buf=f"{output}/gene_contexts.tsv", sep="\t", na_rep='NA') - logging.getLogger(f"detected gene context(s) are listed in: '{output}/gene_contexts.tsv'") + + family_info = {"GeneContext ID":gene_context.ID, + "Gene family name": family.name, + "Sequence ID":None if fam_to_seq is None else ','.join(fam_to_seq.get(family.ID)), + "Nb Genomes":len(family.organisms), + "Partition": family.named_partition } + lines.append(family_info) + + df = pd.DataFrame(lines).set_index("GeneContext ID") + + df = df.sort_values(["GeneContext ID", "Sequence ID"], na_position='last') + + df.to_csv(output, sep="\t", na_rep='NA') + + logging.getLogger().debug(f"detected gene context(s) are listed in: '{output}") def launch(args: 
argparse.Namespace): @@ -460,7 +562,7 @@ def launch(args: argparse.Namespace): search_gene_context_in_pangenome(pangenome=pangenome, output=args.output, tmpdir=args.tmpdir, sequences=args.sequences, families=args.family, transitive=args.transitive, identity=args.identity, coverage=args.coverage, jaccard_threshold=args.jaccard, window_size=args.window_size, - no_defrag=args.no_defrag, cpu=args.cpu, disable_bar=args.disable_prog_bar) + no_defrag=args.no_defrag, cpu=args.cpu, write_context_graph=args.write_graph, disable_bar=args.disable_prog_bar) def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: @@ -514,7 +616,8 @@ def parser_context(parser: argparse.ArgumentParser): optional.add_argument("-s", "--jaccard", required=False, type=restricted_float, default=0.85, help="minimum jaccard similarity used to filter edges between gene families. Increasing it " "will improve precision but lower sensitivity a lot.") - + optional.add_argument('--write_graph', action="store_true", + help="Write context graph in GEXF format.") if __name__ == '__main__': """To test local change and allow using debugger""" From ababedc6c2e50813b72067c057fc7dbf8cefa3a1 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 23 May 2023 13:48:52 +0200 Subject: [PATCH 12/25] fix bug in writting context table with seq input --- ppanggolin/context/searchGeneContext.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index 35470750..551b3ed9 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -531,9 +531,14 @@ def export_context_to_dataframe(gene_contexts: set, fam_to_seq: dict, output: st for gene_context in gene_contexts: for family in gene_context.families: + if fam_to_seq is None or fam_to_seq.get(family.ID) is None: + sequence_id = None + else: + sequence_id = ','.join(fam_to_seq.get(family.ID)) + family_info = {"GeneContext ID":gene_context.ID, "Gene family name": family.name, - "Sequence ID":None if fam_to_seq is None else ','.join(fam_to_seq.get(family.ID)), + "Sequence ID":sequence_id, "Nb Genomes":len(family.organisms), "Partition": family.named_partition } lines.append(family_info) @@ -544,7 +549,7 @@ def export_context_to_dataframe(gene_contexts: set, fam_to_seq: dict, output: st df.to_csv(output, sep="\t", na_rep='NA') - logging.getLogger().debug(f"detected gene context(s) are listed in: '{output}") + logging.getLogger().debug(f"detected gene context(s) are listed in: '{output}'") def launch(args: argparse.Namespace): From 55124ee76e82e93929ca5ffed2fd428fa21db873 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Thu, 8 Jun 2023 14:24:05 +0200 Subject: [PATCH 13/25] export filttered graph with edges jaccard > cutoff --- ppanggolin/context/searchGeneContext.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index 551b3ed9..cbde1a20 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -116,7 +116,7 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: families_in_contexts = {family for gene_context in gene_contexts for family in gene_context.families} - graph_with_final_contexts = nx.subgraph_view(gene_context_graph, filter_node=lambda n: n in families_in_contexts) + graph_with_final_contexts = nx.subgraph_view(filtered_graph, filter_node=lambda n: n in 
families_in_contexts) if write_context_graph: write_graph(graph_with_final_contexts, output, gene_families, gene_contexts) From 5f3f62966fcc460f4a2b99b5fc8bbc84ba3ed161 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 14 Jun 2023 11:31:01 +0200 Subject: [PATCH 14/25] load less thing when just writing family prot seqs --- ppanggolin/formats/writeSequences.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py index 4f9a78b7..e759cf88 100644 --- a/ppanggolin/formats/writeSequences.py +++ b/ppanggolin/formats/writeSequences.py @@ -347,14 +347,18 @@ def write_sequence_files(pangenome: Pangenome, output: str, fasta: str = None, a need_regions = False need_modules = False - if any(x is not None for x in [regions, genes, gene_families, prot_families]): + if prot_families is not None: + need_families = True + + if any(x is not None for x in [regions, genes, gene_families]): need_annotations = True need_families = True if regions is not None or any(x == "rgp" for x in (genes, gene_families, prot_families)): + need_annotations = True need_regions = True if any(x in ["persistent", "shell", "cloud"] for x in (genes, gene_families, prot_families)): need_partitions = True - for x in (genes, gene_families, prot_families): + for x in (genes, gene_families): if x is not None and 'module_' in x: need_modules = True From fac6a006c2b5e6644ec3c0ad1acdbd21a7b9a799 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 20 Jun 2023 14:51:49 +0200 Subject: [PATCH 15/25] Add transitivity in edge attributes --- ppanggolin/context/searchGeneContext.py | 26 ++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index 5901acb3..48e4811e 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -182,11 +182,13 @@ def filter_attribute(data:dict): G = nx.Graph() - G.add_edges_from(((f1.name, f2.name) for f1,f2 in context_graph.edges())) + G.add_edges_from((f1.name, f2.name, filter_attribute(d)) for f1, f2, d in context_graph.edges(data=True)) + - edges_with_attributes = {(f1.name, f2.name):filter_attribute(d) for f1,f2,d in context_graph.edges(data=True)} + # convert transitivity dict to str + edges_with_transitivity_str = {(f1.name, f2.name):str(d['transitivity']) for f1, f2, d in context_graph.edges(data=True)} - nx.set_edge_attributes(G, edges_with_attributes) + nx.set_edge_attributes(G, edges_with_transitivity_str, name="transitivity") nodes_attributes_filtered = {f.name:filter_attribute(d) for f,d in context_graph.nodes(data=True)} @@ -235,6 +237,12 @@ def compute_edge_metrics(context_graph: nx.Graph, gene_proportion_cutoff: float) data[f'f2_jaccard_gene'] = f2_gene_proportion data[f'is_jaccard_gene_>_{gene_proportion_cutoff}'] = (f1_gene_proportion >= gene_proportion_cutoff) and (f2_gene_proportion >= gene_proportion_cutoff) + + transitivity_counter = data['transitivity'] + + mean_transitivity = sum((transitivity*counter for transitivity, counter in transitivity_counter.items()))/sum((counter for counter in transitivity_counter.values())) + + data['mean_transitivity'] = mean_transitivity # the following commented out lines are additional metrics that could be used @@ -283,10 +291,17 @@ def add_edges_to_context_graph(context_graph: nx.Graph, context_graph.add_edge(gene.family, next_gene.family) - if i == 0: - 
context_graph[gene.family][next_gene.family]['adjacent_family'] = True + edge_dict = context_graph[gene.family][next_gene.family] + if i == 0: + edge_dict['adjacent_family'] = True + + # Store information of the transitivity used to link the two genes: + if "transitivity" not in edge_dict: + edge_dict['transitivity'] = {i:0 for i in range(t +1)} + edge_dict['transitivity'][i] += 1 + # Add node attributes node_gene_dict = context_graph.nodes[gene.family] next_gene_gene_dict = context_graph.nodes[next_gene.family] @@ -375,6 +390,7 @@ def get_n_next_genes_index(current_index: int, next_genes_count: int, contig_siz if i == next_genes_count: break yield next_gene_index + def extract_contig_window(contig_size: int, positions_of_interest: Iterable[int], window_size: int, is_circular:bool = False): """ From d9f8a9c5de332b2e3a297a1fba2ddd0ae3a4eb80 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 20 Jun 2023 14:55:51 +0200 Subject: [PATCH 16/25] add metadata subparser in __init__ --- ppanggolin/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ppanggolin/__init__.py b/ppanggolin/__init__.py index b797d552..57657904 100755 --- a/ppanggolin/__init__.py +++ b/ppanggolin/__init__.py @@ -11,6 +11,7 @@ import ppanggolin.mod import ppanggolin.context import ppanggolin.workflow +import ppanggolin.meta # import ppanggolin.utility @@ -33,5 +34,6 @@ "rgp":ppanggolin.RGP.genomicIsland.subparser, "spot":ppanggolin.RGP.spot.subparser, "module":ppanggolin.mod.subparser, - "context":ppanggolin.context.subparser,# "info":ppanggolin.info.subparser, "default_config":ppanggolin.utility.default_config.subparser + "context":ppanggolin.context.subparser, + "metadata":ppanggolin.meta.subparser # "info":ppanggolin.info.subparser, "default_config":ppanggolin.utility.default_config.subparser } From 05127ded5ce1ba76e6d0432c80fce33343464a67 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 21 Jun 2023 11:09:00 +0200 Subject: [PATCH 17/25] add graph in gene context object and return them This useful for panorama --- ppanggolin/context/searchGeneContext.py | 133 ++++++++++++++---------- ppanggolin/region.py | 45 ++++++-- 2 files changed, 113 insertions(+), 65 deletions(-) diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index 48e4811e..515a9aeb 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -56,7 +56,7 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: check_pangenome_info(pangenome, need_annotations=True, need_families=True, disable_bar=disable_bar) - gene_families = set() + families_of_interest = set() fam_2_seq = None if sequences is not None: @@ -69,14 +69,14 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: new_tmpdir.cleanup() for pan_family in seq2pan.values(): - gene_families.add(pan_family) + families_of_interest.add(pan_family) fam_2_seq = fam2seq(seq2pan) if families is not None: with open(families, 'r') as f: for fam_name in f.read().splitlines(): - gene_families.add(pangenome.get_gene_family(fam_name)) + families_of_interest.add(pangenome.get_gene_family(fam_name)) # half_window = round((window_size-1)/2) # logging.info(f'Window size of {half_window*2 + 1}. 
Gene context will include {half_window} genes on each side of the target gene.') @@ -86,7 +86,7 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: logging.getLogger().info("Building the graph...") - gene_context_graph = compute_gene_context_graph(families=gene_families, transitive=transitive, window_size=window_size, disable_bar=disable_bar) + gene_context_graph = compute_gene_context_graph(families=families_of_interest, transitive=transitive, window_size=window_size, disable_bar=disable_bar) logging.getLogger().info( f"Took {round(time.time() - start_time, 2)} seconds to build the graph to find common gene contexts") @@ -102,27 +102,14 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: logging.getLogger().debug(f"Filtering context graph on {filter_flag}") logging.getLogger().debug(f"Context graph made of {nx.number_of_nodes(filtered_graph)} nodes and {nx.number_of_edges(filtered_graph)} edges") - connected_components = nx.connected_components(filtered_graph) - # Connected component graph Filtering - - # remove singleton famillies - connected_components = (component for component in connected_components if len(component) > 1) - - # remove component made only of famillies not initially requested - connected_components = (component for component in connected_components if component & gene_families) - - gene_contexts = {GeneContext(gc_id=i, families=component) for i, component in enumerate(connected_components) } - - families_in_contexts = {family for gene_context in gene_contexts for family in gene_context.families} - - graph_with_final_contexts = nx.subgraph_view(filtered_graph, filter_node=lambda n: n in families_in_contexts) + gene_contexts = get_gene_contexts(filtered_graph, families_of_interest) if write_context_graph: - write_graph(graph_with_final_contexts, output, gene_families, gene_contexts) + write_graph(gene_contexts, output, families_of_interest, graph_format=['graphml', "gexf"]) - if len(families_in_contexts) != 0: - logging.getLogger().debug(f"There are {len(families_in_contexts)} families among {len(gene_contexts)} gene contexts") + if len(gene_contexts) != 0: + logging.getLogger().debug(f"There are {sum((len(gc) for gc in gene_contexts))} families among {len(gene_contexts)} gene contexts") output_file = os.path.join(output, "gene_contexts.tsv") @@ -133,7 +120,7 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: logging.getLogger().info(f"Computing gene contexts took {round(time.time() - start_time, 2)} seconds") - + return gene_contexts # # Finding connected components with panmodule functions # # extract the modules from the graph @@ -155,20 +142,60 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: # export_context_to_dataframe(gene_contexts_pandmodule_way, fam_2_seq, output_file) +def get_gene_contexts(context_graph: nx.Graph, families_of_interest: Set[GeneFamily]) -> Set[GeneContext]: + """ + Extract gene contexts from a context graph based on the provided set of gene families of interest. + + Gene contexts are extracted from a context graph by identifying connected components. + The function filters the connected components based on the following criteria: + - Remove singleton families (components with only one gene family). + - Remove components that do not contain any gene families of interest. + + For each remaining connected component, a GeneContext object is created. 
+ + :param context_graph: The context graph from which to extract gene contexts. + :param families_of_interest: Set of gene families of interest. + :return: Set of GeneContext objects representing the extracted gene contexts. + """ + + connected_components = nx.connected_components(context_graph) + + # Connected component graph Filtering + + # remove singleton famillies + connected_components = (component for component in connected_components if len(component) > 1) + + # remove component made only of famillies not initially requested + connected_components = (component for component in connected_components if component & families_of_interest) + + gene_contexts = set() + for i, component in enumerate(connected_components): + + family_of_interest_of_gc = component & families_of_interest + gene_context = GeneContext(gc_id=i, families=component, families_of_interest=family_of_interest_of_gc) + + graph_of_gc = nx.subgraph_view(context_graph, filter_node=lambda n: n in component) # .copy() + nx.set_node_attributes(graph_of_gc, i, name="gene_context_id") + gene_context.add_context_graph(graph_of_gc) + + gene_contexts.add(gene_context) + + return gene_contexts -def write_graph(context_graph: nx.Graph, output_dir: str, famillies_of_interest: Set[GeneFamily], gene_contexts:List[GeneContext]): + +def write_graph(gene_contexts:List[GeneContext], output_dir: str, families_of_interest: Set[GeneFamily], graph_format:List[str]): """ Write a graph to file with node and edge attributes. - This function writes a graph to a file in the GraphML format or in GEXF format. The original context graph contains + This function writes a graph to a file in the GraphML format or/and in GEXF format. The original context graph contains ppanggolin objects as nodes and lists and dictionaries in edge attributes. Since these objects cannot be written to the output graph, this function creates a new graph that contains only writable objects. - :param context_graph: A NetworkX Graph object representing the graph. + :param gene_contexts: List of gene context. it includes graph of the context :param output_dir: The output directory where the graph file will be written. - :param famillies_of_interest: A list of node objects that are of interest. - :param gene_contexts: List of gene context, used to add context id to node of the graph + :param families_of_interest: A list of node objects that are of interest. + :param graph_format: List of formats of the output graph. 
Can be graphml or gexf """ def filter_attribute(data:dict): @@ -182,38 +209,36 @@ def filter_attribute(data:dict): G = nx.Graph() - G.add_edges_from((f1.name, f2.name, filter_attribute(d)) for f1, f2, d in context_graph.edges(data=True)) + for gc in gene_contexts: + G.add_edges_from((f1.name, f2.name, filter_attribute(d)) for f1, f2, d in gc.graph.edges(data=True)) - # convert transitivity dict to str - edges_with_transitivity_str = {(f1.name, f2.name):str(d['transitivity']) for f1, f2, d in context_graph.edges(data=True)} - - nx.set_edge_attributes(G, edges_with_transitivity_str, name="transitivity") + # convert transitivity dict to str + edges_with_transitivity_str = {(f1.name, f2.name):str(d['transitivity']) for f1, f2, d in gc.graph.edges(data=True)} - nodes_attributes_filtered = {f.name:filter_attribute(d) for f,d in context_graph.nodes(data=True)} - - # on top of attributes already contained in node of context graph - # add organisms and genes count that have the family, the partition and if the family was in initially requested - nodes_family_data = {f.name:{"organisms":len(f.organisms), - "partition":f.partition, - "genes":len(f.genes), - "famillies_of_interest": f in famillies_of_interest} for f in context_graph.nodes()} + nx.set_edge_attributes(G, edges_with_transitivity_str, name="transitivity") - family_name_to_context_id = {family.name:context.ID for context in gene_contexts for family in context.families} - - for f, d in G.nodes(data=True): - d.update(nodes_family_data[f]) - d.update(nodes_attributes_filtered[f]) - d['context_id'] = family_name_to_context_id[f] + nodes_attributes_filtered = {f.name:filter_attribute(d) for f,d in gc.graph.nodes(data=True)} + + # on top of attributes already contained in node of context graph + # add organisms and genes count that have the family, the partition and if the family was in initially requested + nodes_family_data = {f.name:{"organisms": len(f.organisms), + "partition": f.named_partition, + "genes": len(f.genes), + "families_of_interest": f in families_of_interest} for f in gc.graph.nodes()} + for f, d in G.nodes(data=True): + d.update(nodes_family_data[f]) + d.update(nodes_attributes_filtered[f]) - - graphml_file = os.path.join(output_dir, "graph_context.graphml") - logging.info(f'Writting context graph in {graphml_file}') - nx.write_graphml_lxml(G, graphml_file) + if "graphml" in graph_format: + graphml_file = os.path.join(output_dir, "graph_context.graphml") + logging.info(f'Writting context graph in {graphml_file}') + nx.write_graphml_lxml(G, graphml_file) - gexf_file = os.path.join(output_dir, "graph_context.gexf") - logging.info(f'Writting context graph in {gexf_file}') - nx.readwrite.gexf.write_gexf(G, gexf_file) + if "gexf" in graph_format: + gexf_file = os.path.join(output_dir, "graph_context.gexf") + logging.info(f'Writting context graph in {gexf_file}') + nx.readwrite.gexf.write_gexf(G, gexf_file) def compute_edge_metrics(context_graph: nx.Graph, gene_proportion_cutoff: float) -> None: @@ -610,7 +635,7 @@ def parser_context(parser: argparse.ArgumentParser): required = parser.add_argument_group(title="Required arguments", description="All of the following arguments are required :") required.add_argument('-p', '--pangenome', required=False, type=str, help="The pangenome.h5 file") - required.add_argument('-o', '--output', required=True, type=str, + required.add_argument('-o', '--output', required=False, type=str, help="Output directory where the file(s) will be written") onereq = parser.add_argument_group(title="Input file", 
description="One of the following argument is required :") onereq.add_argument('-S', '--sequences', required=False, type=str, diff --git a/ppanggolin/region.py b/ppanggolin/region.py index 7209091a..f7f40c7f 100644 --- a/ppanggolin/region.py +++ b/ppanggolin/region.py @@ -7,8 +7,8 @@ from collections.abc import Iterable # installed libraries -from typing import Dict - +from typing import Dict, Set +import networkx as nx import gmpy2 # local libraries @@ -431,26 +431,49 @@ def mk_bitarray(self, index: Dict[Organism, int], partition: str = 'all'): class GeneContext: """ - A class used to represent a gene context + A class used to represent a gene context. + + :param gc_id: Identifier of the gene context. + :param families: Gene families related to the gene context. + :param families_of_interest: Input families for which the context is being searched. - :param gc_id : identifier of the Gene context - :param families: Gene families related to the GeneContext + Gene contexts are used to represent a set of gene families and their relationships. """ - def __init__(self, gc_id: int, families: set = None): + def __init__(self, gc_id: int, families: Set[GeneFamily] = None, families_of_interest: Set[GeneFamily] = None): self.ID = gc_id self.families = set() + self.families_of_interest = families_of_interest + self.graph = None if families is not None: if not all(isinstance(fam, GeneFamily) for fam in families): - raise Exception("You provided elements that were not GeneFamily object." - " GeneContext are only made of GeneFamily") + raise Exception("You provided elements that were not GeneFamily objects. " + "GeneContexts are only made of GeneFamily objects.") self.families |= set(families) def add_family(self, family: GeneFamily): """ - Allow to add one family in the GeneContext - :param family: family to add + Add a gene family to the gene context. + + :param family: The gene family to add. """ if not isinstance(family, GeneFamily): - raise Exception("You did not provide a GenFamily object. Modules are only made of GeneFamily") + raise Exception("You did not provide a GeneFamily object. " + "GeneContexts are only made of GeneFamily objects.") self.families.add(family) + + def __len__(self) -> int: + """ + Get length of a context graph by returning the number of gene families it includes. + + :return: number of family in the gene context + """ + return len(self.families) + + def add_context_graph(self, graph: nx.Graph): + """ + Add a context graph to the gene context. + + :param graph: The context graph. 
+ """ + self.graph = graph \ No newline at end of file From 690317e2afa187ca5c2b8c64bd7da4d752b6f27b Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Thu, 22 Jun 2023 16:34:44 +0200 Subject: [PATCH 18/25] change graph writing to make panorama interfacing easier --- ppanggolin/context/searchGeneContext.py | 118 +++++++++++++++--------- 1 file changed, 73 insertions(+), 45 deletions(-) diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index 515a9aeb..95ef6675 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -49,13 +49,6 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: :param disable_bar: Allow preventing bar progress print """ - # check statuses and load info - if sequences is not None and pangenome.status["geneFamilySequences"] not in ["inFile", "Loaded", "Computed"]: - raise Exception("Cannot use this function as your pangenome does not have gene families representatives " - "associated to it. For now this works only if the clustering has been made by PPanGGOLiN.") - - check_pangenome_info(pangenome, need_annotations=True, need_families=True, disable_bar=disable_bar) - families_of_interest = set() fam_2_seq = None @@ -98,15 +91,20 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: # Filter graph filter_flag = f'is_jaccard_gene_>_{jaccard_threshold}' - filtered_graph = nx.subgraph_view(gene_context_graph, filter_edge=lambda n1, n2: gene_context_graph[n1][n2][filter_flag] ) + edges_to_remove = [(n,v) for n,v,d in gene_context_graph.edges(data=True) if not d[filter_flag]] + gene_context_graph.remove_edges_from(edges_to_remove) + + # filtered_context_graph = nx.subgraph_view(gene_context_graph, filter_edge=lambda n1, n2: gene_context_graph[n1][n2][filter_flag] ) logging.getLogger().debug(f"Filtering context graph on {filter_flag}") - logging.getLogger().debug(f"Context graph made of {nx.number_of_nodes(filtered_graph)} nodes and {nx.number_of_edges(filtered_graph)} edges") + logging.getLogger().debug(f"Context graph made of {nx.number_of_nodes(gene_context_graph)} nodes and {nx.number_of_edges(gene_context_graph)} edges") - gene_contexts = get_gene_contexts(filtered_graph, families_of_interest) + gene_contexts = get_gene_contexts(gene_context_graph, families_of_interest) - if write_context_graph: - write_graph(gene_contexts, output, families_of_interest, graph_format=['graphml', "gexf"]) + + gene_context_graph = make_graph_writable(gene_context_graph) + print(f"WRITING gene context graph in {output}") + write_graph(gene_context_graph, output, graph_format=['graphml', "gexf"]) if len(gene_contexts) != 0: logging.getLogger().debug(f"There are {sum((len(gc) for gc in gene_contexts))} families among {len(gene_contexts)} gene contexts") @@ -120,7 +118,7 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: logging.getLogger().info(f"Computing gene contexts took {round(time.time() - start_time, 2)} seconds") - return gene_contexts + return gene_context_graph # # Finding connected components with panmodule functions # # extract the modules from the graph @@ -169,35 +167,46 @@ def get_gene_contexts(context_graph: nx.Graph, families_of_interest: Set[GeneFam connected_components = (component for component in connected_components if component & families_of_interest) gene_contexts = set() + families_in_context = set() + for i, component in enumerate(connected_components): - + families_in_context |= component 
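        # Intersect the component with the requested families, wrap it in a
        # GeneContext, and tag its nodes with the context id and a
        # families_of_interest flag; these plain attributes pass through
        # make_graph_writable and end up in the exported graph.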
family_of_interest_of_gc = component & families_of_interest gene_context = GeneContext(gc_id=i, families=component, families_of_interest=family_of_interest_of_gc) - - graph_of_gc = nx.subgraph_view(context_graph, filter_node=lambda n: n in component) # .copy() - nx.set_node_attributes(graph_of_gc, i, name="gene_context_id") - gene_context.add_context_graph(graph_of_gc) + + # nx.set_node_attributes(context_graph, i, name="gene_context_id") + # add gc id to node attribute + node_attributes = {n:{"gene_context_id":i, "families_of_interest": n in families_of_interest} for n in component} + nx.set_node_attributes(context_graph, node_attributes) + + # for n, attribute in context_graph.nodes(data=True): + # if n in component: + # attribute['gene_context_id'] = i + # attribute['families_of_interest'] = n in families_of_interest + # print(context_graph.get_node_attributes(n, 'families_of_interest') ) + # # gene_context.add_context_graph(graph_of_gc) gene_contexts.add(gene_context) + + node_not_in_context = set(context_graph.nodes()) - families_in_context + context_graph.remove_nodes_from(node_not_in_context) return gene_contexts -def write_graph(gene_contexts:List[GeneContext], output_dir: str, families_of_interest: Set[GeneFamily], graph_format:List[str]): +def make_graph_writable(context_graph): + """ - Write a graph to file with node and edge attributes. - - This function writes a graph to a file in the GraphML format or/and in GEXF format. The original context graph contains + + The original context graph contains ppanggolin objects as nodes and lists and dictionaries in edge attributes. Since these objects cannot be written to the output graph, this function creates a new graph that contains only writable objects. :param gene_contexts: List of gene context. it includes graph of the context - :param output_dir: The output directory where the graph file will be written. - :param families_of_interest: A list of node objects that are of interest. - :param graph_format: List of formats of the output graph. Can be graphml or gexf """ + def filter_attribute(data:dict): """ Helper function to filter the edge attributes. @@ -206,30 +215,40 @@ def filter_attribute(data:dict): :return: A filtered dictionary containing only non-collection attributes. 
""" return {k:v for k, v in data.items() if type(v) not in [set, dict, list]} - + G = nx.Graph() - for gc in gene_contexts: - G.add_edges_from((f1.name, f2.name, filter_attribute(d)) for f1, f2, d in gc.graph.edges(data=True)) - + G.add_edges_from((f1.name, f2.name, filter_attribute(d)) for f1, f2, d in context_graph.edges(data=True)) - # convert transitivity dict to str - edges_with_transitivity_str = {(f1.name, f2.name):str(d['transitivity']) for f1, f2, d in gc.graph.edges(data=True)} - - nx.set_edge_attributes(G, edges_with_transitivity_str, name="transitivity") + + # convert transitivity dict to str + edges_with_transitivity_str = {(f1.name, f2.name):str(d['transitivity']) for f1, f2, d in context_graph.edges(data=True)} + + nx.set_edge_attributes(G, edges_with_transitivity_str, name="transitivity") + + nodes_attributes_filtered = {f.name:filter_attribute(d) for f,d in context_graph.nodes(data=True)} + + # on top of attributes already contained in node of context graph + # add organisms and genes count that have the family, the partition and if the family was in initially requested + nodes_family_data = {f.name:{"organisms": len(f.organisms), + "partition": f.named_partition, + "genes": len(f.genes)} for f in context_graph.nodes()} - nodes_attributes_filtered = {f.name:filter_attribute(d) for f,d in gc.graph.nodes(data=True)} - - # on top of attributes already contained in node of context graph - # add organisms and genes count that have the family, the partition and if the family was in initially requested - nodes_family_data = {f.name:{"organisms": len(f.organisms), - "partition": f.named_partition, - "genes": len(f.genes), - "families_of_interest": f in families_of_interest} for f in gc.graph.nodes()} - for f, d in G.nodes(data=True): - d.update(nodes_family_data[f]) - d.update(nodes_attributes_filtered[f]) - + for f, d in G.nodes(data=True): + d.update(nodes_family_data[f]) + d.update(nodes_attributes_filtered[f]) + + return G + +def write_graph(G:nx.Graph, output_dir: str, graph_format:List[str]): + """ + Write a graph to file in the GraphML format or/and in GEXF format. + + :param output_dir: The output directory where the graph file will be written. + :param graph_format: List of formats of the output graph. Can be graphml or gexf + + """ + if "graphml" in graph_format: graphml_file = os.path.join(output_dir, "graph_context.graphml") logging.info(f'Writting context graph in {graphml_file}') @@ -605,6 +624,15 @@ def launch(args: argparse.Namespace): mk_outdir(args.output, args.force) pangenome = Pangenome() pangenome.add_file(args.pangenome) + + # check statuses and load info + if args.sequences is not None and pangenome.status["geneFamilySequences"] not in ["inFile", "Loaded", "Computed"]: + raise Exception("Cannot use this function as your pangenome does not have gene families representatives " + "associated to it. For now this works only if the clustering has been made by PPanGGOLiN.") + + check_pangenome_info(pangenome, need_annotations=True, need_families=True, disable_bar=args.disable_prog_bar) + + search_gene_context_in_pangenome(pangenome=pangenome, output=args.output, tmpdir=args.tmpdir, sequences=args.sequences, families=args.family, transitive=args.transitive, identity=args.identity, coverage=args.coverage, jaccard_threshold=args.jaccard, window_size=args.window_size, From c8faaab2c8ee9269948764e919a70ceb1f9cc975 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Fri, 23 Jun 2023 11:45:51 +0200 Subject: [PATCH 19/25] make transitivity starts at 0. 
0 mean no gap between adj genes. --- ppanggolin/context/searchGeneContext.py | 34 ++++++------------------- tests/context/test_context.py | 10 ++++---- 2 files changed, 13 insertions(+), 31 deletions(-) diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index 95ef6675..78b16083 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -29,7 +29,7 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: str, sequences: str = None, families: str = None, transitive: int = 4, identity: float = 0.5, coverage: float = 0.8, jaccard_threshold: float = 0.85, window_size: int = 1, no_defrag: bool = False, - cpu: int = 1, write_context_graph:bool = False, disable_bar=True): + cpu: int = 1, disable_bar=True): """ Main function to search common gene contexts between sequence set and pangenome families @@ -103,7 +103,6 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: gene_context_graph = make_graph_writable(gene_context_graph) - print(f"WRITING gene context graph in {output}") write_graph(gene_context_graph, output, graph_format=['graphml', "gexf"]) if len(gene_contexts) != 0: @@ -301,7 +300,7 @@ def compute_edge_metrics(context_graph: nx.Graph, gene_proportion_cutoff: float) def add_edges_to_context_graph(context_graph: nx.Graph, contig_genes: Iterable[Gene], contig_windows: List[Tuple[int, int]], - t: int, + transitivity: int, is_circular: bool): """ Add edges to the context graph based on contig genes and windows. @@ -309,14 +308,14 @@ def add_edges_to_context_graph(context_graph: nx.Graph, :param context_graph: The context graph to which edges will be added. :param contig_genes: An iterable of genes in the contig. :param contig_windows: A list of tuples representing the start and end positions of contig windows. - :param t: The number of next genes to consider when adding edges. + :param transitivity: The number of next genes to consider when adding edges. :param is_circular: A boolean indicating if the contig is circular. 
""" for window_start, window_end in contig_windows: for gene_index in range(window_start, window_end + 1): gene = contig_genes[gene_index] - next_genes = get_n_next_genes_index(gene_index, next_genes_count=t, + next_genes = get_n_next_genes_index(gene_index, next_genes_count=transitivity+1, contig_size=len(contig_genes), is_circular=is_circular) next_genes = list(next_genes) @@ -342,7 +341,7 @@ def add_edges_to_context_graph(context_graph: nx.Graph, # Store information of the transitivity used to link the two genes: if "transitivity" not in edge_dict: - edge_dict['transitivity'] = {i:0 for i in range(t +1)} + edge_dict['transitivity'] = {i:0 for i in range(transitivity +1)} edge_dict['transitivity'][i] += 1 @@ -541,23 +540,6 @@ def compute_gene_context_graph(families: Iterable[GeneFamily], transitive: int = return context_graph -# def compute_gene_context(g: nx.Graph, jaccard: float = 0.85) -> set: -# """ -# Compute the gene contexts in the graph - -# :param g: Graph of gene contexts between interesting gene families of the pan -# :param jaccard: Jaccard index - -# :return: Set of gene contexts find in graph -# """ - -# gene_contexts = set() -# c = 1 -# for comp in connected_components(g, removed=set(), weight=jaccard): -# gene_contexts.add(GeneContext(gc_id=c, families=comp)) -# c += 1 -# return gene_contexts - def fam2seq(seq_to_pan: dict) -> dict: """ @@ -636,7 +618,7 @@ def launch(args: argparse.Namespace): search_gene_context_in_pangenome(pangenome=pangenome, output=args.output, tmpdir=args.tmpdir, sequences=args.sequences, families=args.family, transitive=args.transitive, identity=args.identity, coverage=args.coverage, jaccard_threshold=args.jaccard, window_size=args.window_size, - no_defrag=args.no_defrag, cpu=args.cpu, write_context_graph=args.write_graph, disable_bar=args.disable_prog_bar) + no_defrag=args.no_defrag, cpu=args.cpu, disable_bar=args.disable_prog_bar) def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: @@ -690,8 +672,8 @@ def parser_context(parser: argparse.ArgumentParser): optional.add_argument("-s", "--jaccard", required=False, type=restricted_float, default=0.85, help="minimum jaccard similarity used to filter edges between gene families. 
Increasing it " "will improve precision but lower sensitivity a lot.") - optional.add_argument('--write_graph', action="store_true", - help="Write context graph in GEXF format.") + # optional.add_argument('--write_graph', action="store_true", + # help="Write context graph in GEXF format.") optional.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus") if __name__ == '__main__': diff --git a/tests/context/test_context.py b/tests/context/test_context.py index 1fd45d62..014ef334 100644 --- a/tests/context/test_context.py +++ b/tests/context/test_context.py @@ -106,7 +106,7 @@ def test_add_edges_to_context_graph(simple_contig): add_edges_to_context_graph(context_graph, contig_genes = simple_contig.genes, contig_windows = [(0,3)], - t=2, + transitivity=1, is_circular=simple_contig.is_circular) nodes = sorted([n.name for n in context_graph.nodes()]) @@ -127,7 +127,7 @@ def test_add_edges_to_context_graph_2(simple_contig): add_edges_to_context_graph(context_graph, contig_genes = simple_contig.genes, contig_windows = [(1,3)], - t=1, + transitivity=0, is_circular=simple_contig.is_circular) nodes = sorted([n.name for n in context_graph.nodes()]) @@ -149,7 +149,7 @@ def test_add_edges_to_context_graph_linear(simple_contig): add_edges_to_context_graph(context_graph, contig_genes = simple_contig.genes, contig_windows = [(4,5), (0,2)], - t=1, + transitivity=0, is_circular=False) nodes = sorted([n.name for n in context_graph.nodes()]) @@ -173,7 +173,7 @@ def test_add_edges_to_context_graph_circular(simple_contig): add_edges_to_context_graph(context_graph, contig_genes = simple_contig.genes, contig_windows = [(4,5), (0,2)], - t=1, + transitivity=0, is_circular=True) nodes = sorted([n.name for n in context_graph.nodes()]) @@ -200,7 +200,7 @@ def test_compute_gene_context_graph(simple_contig): families_of_interest = {f for f in families_in_contigs if f.name in family_names_of_interest } context_graph = compute_gene_context_graph(families_of_interest, - transitive=1, + transitive=0, window_size = 2) nodes = sorted([n.name for n in context_graph.nodes()]) edges = {tuple(sorted([n.name, v.name])) for n, v in context_graph.edges()} From 50aa4d5e0c42e8b4ae0e932594db0a56bb657904 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Mon, 26 Jun 2023 17:38:36 +0200 Subject: [PATCH 20/25] change output of the function to fit panorama needs --- ppanggolin/context/searchGeneContext.py | 36 +++++++++++++------------ 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index 78b16083..32584328 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -29,7 +29,7 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: str, sequences: str = None, families: str = None, transitive: int = 4, identity: float = 0.5, coverage: float = 0.8, jaccard_threshold: float = 0.85, window_size: int = 1, no_defrag: bool = False, - cpu: int = 1, disable_bar=True): + cpu: int = 1, graph_format:str = "graphml", disable_bar=True): """ Main function to search common gene contexts between sequence set and pangenome families @@ -45,7 +45,7 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: :param window_size: Number of genes to consider in the gene context. 
:param no_defrag: do not use the defrag workflow if true :param cpu: Number of core used to process - :param write_context_graph: Write graph of the contexts + :param graph_format: Write format of the context graph. Can be graphml or gexf :param disable_bar: Allow preventing bar progress print """ @@ -103,7 +103,7 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: gene_context_graph = make_graph_writable(gene_context_graph) - write_graph(gene_context_graph, output, graph_format=['graphml', "gexf"]) + out_graph_file = write_graph(gene_context_graph, output, graph_format) if len(gene_contexts) != 0: logging.getLogger().debug(f"There are {sum((len(gc) for gc in gene_contexts))} families among {len(gene_contexts)} gene contexts") @@ -117,7 +117,7 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: logging.getLogger().info(f"Computing gene contexts took {round(time.time() - start_time, 2)} seconds") - return gene_context_graph + return gene_context_graph, out_graph_file # # Finding connected components with panmodule functions # # extract the modules from the graph @@ -239,25 +239,28 @@ def filter_attribute(data:dict): return G -def write_graph(G:nx.Graph, output_dir: str, graph_format:List[str]): +def write_graph(G:nx.Graph, output_dir: str, graph_format:str): """ Write a graph to file in the GraphML format or/and in GEXF format. :param output_dir: The output directory where the graph file will be written. - :param graph_format: List of formats of the output graph. Can be graphml or gexf + :param graph_format: Formats of the output graph. Can be graphml or gexf """ - if "graphml" in graph_format: - graphml_file = os.path.join(output_dir, "graph_context.graphml") - logging.info(f'Writting context graph in {graphml_file}') - nx.write_graphml_lxml(G, graphml_file) + if "graphml" == graph_format: + out_file = os.path.join(output_dir, "graph_context.graphml") + logging.info(f'Writting context graph in {out_file}') + nx.write_graphml_lxml(G, out_file) - if "gexf" in graph_format: - gexf_file = os.path.join(output_dir, "graph_context.gexf") - logging.info(f'Writting context graph in {gexf_file}') - nx.readwrite.gexf.write_gexf(G, gexf_file) + elif "gexf" == graph_format: + out_file = os.path.join(output_dir, "graph_context.gexf") + logging.info(f'Writting context graph in {out_file}') + nx.readwrite.gexf.write_gexf(G, out_file) + else: + raise ValueError(f'The given graph format ({graph_format}) is not correct. it should be "graphml" or gexf') + return out_file def compute_edge_metrics(context_graph: nx.Graph, gene_proportion_cutoff: float) -> None: """ @@ -618,7 +621,7 @@ def launch(args: argparse.Namespace): search_gene_context_in_pangenome(pangenome=pangenome, output=args.output, tmpdir=args.tmpdir, sequences=args.sequences, families=args.family, transitive=args.transitive, identity=args.identity, coverage=args.coverage, jaccard_threshold=args.jaccard, window_size=args.window_size, - no_defrag=args.no_defrag, cpu=args.cpu, disable_bar=args.disable_prog_bar) + no_defrag=args.no_defrag, cpu=args.cpu, disable_bar=args.disable_prog_bar, graph_format=args.graph_format) def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: @@ -672,8 +675,7 @@ def parser_context(parser: argparse.ArgumentParser): optional.add_argument("-s", "--jaccard", required=False, type=restricted_float, default=0.85, help="minimum jaccard similarity used to filter edges between gene families. 
Increasing it " "will improve precision but lower sensitivity a lot.") - # optional.add_argument('--write_graph', action="store_true", - # help="Write context graph in GEXF format.") + optional.add_argument('--graph_format', help="Format of the context graph. Can be gexf or graphml.", default='graphml', choices=['gexf','graphml']) optional.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus") if __name__ == '__main__': From e6fb61f56123480081b8d12eef18c70c03c18cd4 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 4 Oct 2023 11:24:59 +0200 Subject: [PATCH 21/25] update doc for new context parameters --- docs/user/Genomic-context.md | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/docs/user/Genomic-context.md b/docs/user/Genomic-context.md index a9f66ef1..e8c98d1a 100644 --- a/docs/user/Genomic-context.md +++ b/docs/user/Genomic-context.md @@ -43,8 +43,10 @@ In **sequence Id**, it is possible to find a NA value. This case, correspond to ## Detailed options | option name | Description | |-----------------------------|---------------------------------------------------------------------------| -| --no_defrag | Do not use the defragmentation step, to align sequences with MMseqs2 | -| --identity | Minimum identity percentage threshold | -| --coverage | Minimum coverage percentage threshold | -| -t, --transitive | Size of the transitive closure used to build the graph. This indicates the number of non-related genes allowed in-between two related genes. Increasing it will improve precision but lower sensitivity a little. | -| -s, --jaccard | Minimum jaccard similarity used to filter edges between gene families. Increasing it will improve precision but lower sensitivity a lot. | \ No newline at end of file +| --fast | Use representative sequences of gene families for input gene alignment. This option is recommended for faster processing but may be less sensitive. By default, all pangenome genes are used for alignment. This argument makes sense only when --sequence is provided. (default: False) | +| --no_defrag | Do not use the defragmentation step, to align sequences with MMseqs2 (default: False) | +| --identity | Minimum identity percentage threshold (default: 0.8)| +| --coverage | Minimum coverage percentage threshold (default: 0.8)| +| -t, --transitive | Size of the transitive closure used to build the graph. This indicates the number of non-related genes allowed in-between two related genes. Increasing it will improve precision but lower sensitivity a little. (default: 4) | +| -s, --jaccard | Minimum jaccard similarity used to filter edges between gene families. Increasing it will improve precision but lower sensitivity a lot. (default: 0.85) | +| -w, --window_size | Number of neighboring genes that are considered on each side of a gene of interest when searching for conserved genomic contexts. 
(default: 5) | \ No newline at end of file From 9b80ceb325e779dab94aad1d576b6165fbd3840c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Fri, 13 Oct 2023 10:11:28 +0200 Subject: [PATCH 22/25] Resolve requested change --- VERSION | 2 +- ppanggolin/align/alignOnPang.py | 171 +++++++------ ppanggolin/context/searchGeneContext.py | 317 ++++++++++++------------ ppanggolin/genome.py | 4 +- ppanggolin/region.py | 40 ++- 5 files changed, 289 insertions(+), 245 deletions(-) diff --git a/VERSION b/VERSION index 70c79534..a11c24f7 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.191 +1.2.192 diff --git a/ppanggolin/align/alignOnPang.py b/ppanggolin/align/alignOnPang.py index eed33323..28a6bc83 100644 --- a/ppanggolin/align/alignOnPang.py +++ b/ppanggolin/align/alignOnPang.py @@ -9,10 +9,9 @@ import subprocess import argparse from collections import defaultdict, Counter -from typing import List, Tuple, Set, Dict, IO, Iterator, Iterable +from typing import List, Tuple, Set, Dict, IO, Iterable from pathlib import Path - from tqdm import tqdm # local libraries @@ -29,7 +28,7 @@ def create_mmseqs_db(seq_files: Iterable[Path], tmpdir: Path, basename="sequence """ Create a MMseqs2 sequence database with the given fasta files. - :param seq_file: An iterable of path of FASTA files. + :param seq_files: An iterable of path of FASTA files. :param tmpdir: Path to the temporary directory where the database will be created. :param basename: Prefix for the database file (default: "sequences"). @@ -37,13 +36,14 @@ def create_mmseqs_db(seq_files: Iterable[Path], tmpdir: Path, basename="sequence """ with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir, delete=False, suffix=".DB", prefix=basename) as seqdb: - cmd = ["mmseqs", "createdb"] + [seq_file.as_posix() for seq_file in seq_files] + [seqdb.name, '--dbtype', '0'] - + cmd = ["mmseqs", "createdb"] + [seq_file.as_posix() for seq_file in seq_files] + [seqdb.name, '--dbtype', '0'] + logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) subprocess.run(cmd, stdout=subprocess.DEVNULL) return Path(seqdb.name) + def translate_with_mmseqs(seqdb: Path, translation_table: int, cpu: int, tmpdir: Path) -> Path: """ Translate nucleotide sequences in an MMseqs2 sequence database to amino acid sequences. @@ -56,21 +56,21 @@ def translate_with_mmseqs(seqdb: Path, translation_table: int, cpu: int, tmpdir: :return: Path to the new MMseqs2 sequence database containing translated amino acid sequences. 
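# --- Illustration (not part of the patch) --------------------------------
# Rough sketch of the MMseqs2 call chain wrapped by this module: createdb
# for query and target, an optional translatenucs step when the input is
# nucleotide, search, then convertalis to export a TSV.  File names are
# placeholders and several search options used in the patch are omitted.
import subprocess

def run(cmd):
    subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True)

run(["mmseqs", "createdb", "input_sequences.fasta", "query.DB", "--dbtype", "0"])
run(["mmseqs", "createdb", "pangenome_sequences.fasta", "target.DB", "--dbtype", "0"])
run(["mmseqs", "translatenucs", "query.DB", "query.aa.DB",      # nucleotide input only
     "--translation-table", "11", "--threads", "1"])
run(["mmseqs", "search", "query.aa.DB", "target.DB", "aln.DB", "tmp",
     "-a", "--min-seq-id", "0.8", "-c", "0.8", "--cov-mode", "2"])
run(["mmseqs", "convertalis", "query.aa.DB", "target.DB", "aln.DB",
     "aln.tsv", "--format-mode", "2"])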
""" - with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir, delete=False, prefix=seqdb.stem, suffix=".aa.DB") as seqdb_aa: + with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir, delete=False, prefix=seqdb.stem, + suffix=".aa.DB") as seqdb_aa: + cmd = ["mmseqs", "translatenucs", seqdb.as_posix(), seqdb_aa.name, "--translation-table", + f"{translation_table}", "--threads", str(cpu)] - cmd = ["mmseqs", "translatenucs", seqdb.as_posix(), seqdb_aa.name, "--translation-table", - f"{translation_table}", "--threads", str(cpu)] - logging.getLogger().debug(" ".join(cmd)) subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) - + return Path(seqdb_aa.name) -def align_seq_to_pang(target_seq_file:Path , query_seq_files: Iterable[Path], +def align_seq_to_pang(target_seq_file: Path, query_seq_files: Iterable[Path], tmpdir: Path, cpu: int = 1, no_defrag: bool = False, - identity: float = 0.8, coverage: float = 0.8, - is_query_nt:bool = False, is_target_nt:bool = False, translation_table: int = None) -> Path: + identity: float = 0.8, coverage: float = 0.8, + is_query_nt: bool = False, is_target_nt: bool = False, translation_table: int = None) -> Path: """ Align fasta sequence to pangenome sequences. @@ -92,36 +92,41 @@ def align_seq_to_pang(target_seq_file:Path , query_seq_files: Iterable[Path], query_db = create_mmseqs_db(query_seq_files, tmpdir, basename="query_sequences") if is_target_nt: - logging.getLogger().debug(f"Target sequences will be translated by mmseqs with translation table {translation_table}") + logging.getLogger().debug( + f"Target sequences will be translated by mmseqs with translation table {translation_table}") target_db = translate_with_mmseqs(target_db, translation_table, cpu, tmpdir) if is_query_nt: - logging.getLogger().debug(f"Query sequences will be translated by mmseqs with translation table {translation_table}") - query_db = translate_with_mmseqs(query_db, translation_table, cpu, tmpdir) + logging.getLogger().debug( + f"Query sequences will be translated by mmseqs with translation table {translation_table}") + query_db = translate_with_mmseqs(query_db, translation_table, cpu, tmpdir) cov_mode = "2" # coverage of query - if no_defrag: - cov_mode = "0" # coverage of query and target - + if no_defrag: + cov_mode = "0" # coverage of query and target + # mmseqs search command # see https://github.com/soedinglab/MMseqs2/issues/373 Using a combination of param to no miss short proteins - with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir.as_posix(), prefix="aln_result_db_file", suffix=".aln.DB", delete=False) as aln_db: - cmd = ["mmseqs", "search", query_db.as_posix(), target_db.as_posix(), aln_db.name, tmpdir.as_posix(), "-a", "--min-seq-id", str(identity), - "-c", str(coverage), "--cov-mode", cov_mode, "--threads", str(cpu), + with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir.as_posix(), prefix="aln_result_db_file", suffix=".aln.DB", + delete=False) as aln_db: + cmd = ["mmseqs", "search", query_db.as_posix(), target_db.as_posix(), aln_db.name, tmpdir.as_posix(), "-a", + "--min-seq-id", str(identity), + "-c", str(coverage), "--cov-mode", cov_mode, "--threads", str(cpu), "--seed-sub-mat", "VTML40.out", "-s", "2", '--comp-bias-corr', "0", "--mask", "0", "-e", "1"] - logging.getLogger().info("Aligning sequences") logging.getLogger().debug(" ".join(cmd)) start = time.time() subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) align_time = time.time() - start - logging.getLogger().info(f"Done aligning sequences in {round(align_time,2)} seconds") + 
logging.getLogger().info(f"Done aligning sequences in {round(align_time, 2)} seconds") - with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir, prefix="aln_result_db_file", suffix = ".tsv", delete=False) as outfile: - cmd = ["mmseqs", "convertalis", query_db.as_posix(), target_db.as_posix(), aln_db.name, outfile.name, "--format-mode", "2"] + with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir, prefix="aln_result_db_file", suffix=".tsv", + delete=False) as outfile: + cmd = ["mmseqs", "convertalis", query_db.as_posix(), target_db.as_posix(), aln_db.name, outfile.name, + "--format-mode", "2"] logging.getLogger().info("Extracting alignments...") logging.getLogger().debug(" ".join(cmd)) @@ -130,7 +135,8 @@ def align_seq_to_pang(target_seq_file:Path , query_seq_files: Iterable[Path], return Path(outfile.name) -def map_input_gene_to_family_all_aln(aln_res: Path, outdir:Path, pangenome: Pangenome) -> Tuple[Dict[str, GeneFamily], Path]: +def map_input_gene_to_family_all_aln(aln_res: Path, outdir: Path, + pangenome: Pangenome) -> Tuple[Dict[str, GeneFamily], Path]: """ Read alignment result to link input sequences to pangenome gene family. Alignment have been made against all genes of the pangenome. @@ -143,15 +149,15 @@ def map_input_gene_to_family_all_aln(aln_res: Path, outdir:Path, pangenome: Pang """ seq2pang = {} - aln_file_clean = outdir / f"alignment_input_seqs_to_all_pangenome_genes.tsv" # write the actual result file + aln_file_clean = outdir / "alignment_input_seqs_to_all_pangenome_genes.tsv" # write the actual result file logging.getLogger().debug(f'Writing alignment file in {aln_file_clean}') - + with open(aln_res, "r") as alnFile, open(aln_file_clean, "w") as aln_outfl: for line in alnFile: line_splitted = line.split() - + line_splitted[1] = line_splitted[1].replace("ppanggolin_", "") # remove the 'ppanggolin_' bit of the id - line_splitted[0] = line_splitted[0].replace("ppanggolin_", "") + line_splitted[0] = line_splitted[0].replace("ppanggolin_", "") input_seq_id, gene_id = line_splitted[0:2] @@ -164,7 +170,8 @@ def map_input_gene_to_family_all_aln(aln_res: Path, outdir:Path, pangenome: Pang return seq2pang, aln_file_clean -def map_input_gene_to_family_rep_aln(aln_res: Path, outdir:Path, pangenome: Pangenome) -> Tuple[Dict[str, GeneFamily], str]: +def map_input_gene_to_family_rep_aln(aln_res: Path, outdir: Path, + pangenome: Pangenome) -> Tuple[Dict[str, GeneFamily], str]: """ Read alignment result to link input sequences to pangenome gene family. Alignment have been made against representative sequence of gene families of the pangenome. 
@@ -176,14 +183,14 @@ def map_input_gene_to_family_rep_aln(aln_res: Path, outdir:Path, pangenome: Pang :return: Dictionnary with sequence link to pangenome gene families and actual path to the cleaned alignment file """ seq2pang = {} - aln_file_clean = outdir / f"alignment_input_seqs_to_pangenome_gene_families.tsv" # write the actual result file + aln_file_clean = outdir / "alignment_input_seqs_to_pangenome_gene_families.tsv" # write the actual result file logging.getLogger().debug(f'Writing alignment file in {aln_file_clean}') with open(aln_res, "r") as alnFile, open(aln_file_clean, "w") as aln_outfl: for line in alnFile: line_splitted = line.split() - + line_splitted[1] = line_splitted[1].replace("ppanggolin_", "") # remove the 'ppanggolin_' bit of the id line_splitted[0] = line_splitted[0].replace("ppanggolin_", "") @@ -198,7 +205,6 @@ def map_input_gene_to_family_rep_aln(aln_res: Path, outdir:Path, pangenome: Pang return seq2pang, aln_file_clean - def get_seq_ids(seq_file: TextIOWrapper) -> Tuple[Set[str], bool]: """ Get sequence IDs from a sequence input file in FASTA format and guess the sequence type based on the first sequences. @@ -211,7 +217,7 @@ def get_seq_ids(seq_file: TextIOWrapper) -> Tuple[Set[str], bool]: seq_set = set() seq_count = 0 first_seq_concat = "" - + for line in seq_file: if line.startswith(">"): seq_set.add(line[1:].split()[0].strip()) @@ -221,12 +227,11 @@ def get_seq_ids(seq_file: TextIOWrapper) -> Tuple[Set[str], bool]: char_counter = Counter(first_seq_concat) is_nucleotide = all(char in dna_expected_char for char in char_counter) - - return seq_set, is_nucleotide + return seq_set, is_nucleotide -def write_gene_fam_sequences(pangenome: Pangenome, file_obj: IO, add: str = "", disable_bar:bool=False): +def write_gene_fam_sequences(pangenome: Pangenome, file_obj: IO, add: str = "", disable_bar: bool = False): """ Export the sequence of gene families @@ -235,12 +240,14 @@ def write_gene_fam_sequences(pangenome: Pangenome, file_obj: IO, add: str = "", :param add: Add prefix to sequence name :param disable_bar: disable progress bar """ - for fam in tqdm(pangenome.gene_families, unit="families", disable=disable_bar, total=pangenome.number_of_gene_families): + for fam in tqdm(pangenome.gene_families, unit="families", disable=disable_bar, + total=pangenome.number_of_gene_families): file_obj.write(">" + add + fam.name + "\n") file_obj.write(fam.sequence + "\n") # file_obj.flush() -def write_all_gene_sequences(pangenome: Pangenome, file_obj: IO, add: str = "", disable_bar:bool = False): + +def write_all_gene_sequences(pangenome: Pangenome, file_obj: IO, add: str = "", disable_bar: bool = False): """ Export the sequence of pangenome genes @@ -257,6 +264,7 @@ def write_all_gene_sequences(pangenome: Pangenome, file_obj: IO, add: str = "", # this should never happen if the pangenome has been properly checked before launching this function. 
raise Exception("The pangenome does not include gene sequences") + def project_and_write_partition(seqid_to_gene_family: Dict[str, GeneFamily], seq_set: Set[str], output: Path) -> Path: """ Project the partition of each sequence from the input file and write them in a file @@ -272,10 +280,11 @@ def project_and_write_partition(seqid_to_gene_family: Dict[str, GeneFamily], seq with open(partition_proj, "w") as partProjFile: for input_seq, pangFam in seqid_to_gene_family.items(): partProjFile.write(input_seq + "\t" + pangFam.named_partition + "\n") - for remainingSeq in seq_set - seqid_to_gene_family.keys(): + for remainingSeq in seq_set - seqid_to_gene_family.keys(): partProjFile.write(remainingSeq + "\tcloud\n") # if there is no hit, it's going to be cloud genes. return partition_proj + def write_gene_to_gene_family(seqid_to_gene_family: Dict[str, GeneFamily], seq_set: Set[str], output: Path) -> Path: """ Write input gene to gene family. @@ -292,7 +301,7 @@ def write_gene_to_gene_family(seqid_to_gene_family: Dict[str, GeneFamily], seq_s for input_seq, pangFam in seqid_to_gene_family.items(): partProjFile.write(f"{input_seq}\t{pangFam.name}\n") - for remainingSeq in seq_set - seqid_to_gene_family.keys(): + for remainingSeq in seq_set - seqid_to_gene_family.keys(): partProjFile.write(f"{remainingSeq}\t{remainingSeq}\n") # if there is no hit, gene family is itself. return gene_fam_map_file @@ -420,10 +429,11 @@ def get_seq_info(seq_to_pang: dict, pangenome: Pangenome, output: Path, draw_rel logging.getLogger("PPanGGOLiN").info(f"File listing RGP and spots where sequences of interest are located : " f"{output / 'info_input_seq.tsv'}") -def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_files: Iterable[Path], output: Path, tmpdir: Path, is_input_seq_nt:bool, - cpu: int = 1, no_defrag: bool = False, identity: float = 0.8, - coverage: float = 0.8, translation_table: int = 11, disable_bar:bool = False) -> Tuple[Path, dict]: - + +def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_files: Iterable[Path], output: Path, + tmpdir: Path, is_input_seq_nt: bool, cpu: int = 1, no_defrag: bool = False, + identity: float = 0.8, coverage: float = 0.8, translation_table: int = 11, + disable_bar: bool = False) -> Tuple[Path, Dict[str, GeneFamily]]: """ Assign gene families from a pangenome to input sequences. @@ -447,25 +457,26 @@ def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_files: Itera """ # delete False to be able to keep tmp file. 
If they are not keep tmpdir will be destroyed so no need to delete tmpfile - with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir.as_posix(), delete=False, + with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir.as_posix(), delete=False, prefix="representative_genes", suffix=".faa") as tmp_pang_file: - logging.getLogger().debug(f'Write gene family sequences in {tmp_pang_file.name}') write_gene_fam_sequences(pangenome, tmp_pang_file, add="ppanggolin_", disable_bar=disable_bar) align_file = align_seq_to_pang(target_seq_file=Path(tmp_pang_file.name), query_seq_files=sequence_files, - tmpdir=tmpdir, cpu=cpu, - no_defrag=no_defrag, identity=identity, coverage=coverage, - is_query_nt=is_input_seq_nt, is_target_nt=False, - translation_table=translation_table) + tmpdir=tmpdir, cpu=cpu, + no_defrag=no_defrag, identity=identity, coverage=coverage, + is_query_nt=is_input_seq_nt, is_target_nt=False, + translation_table=translation_table) seq2pang, align_file = map_input_gene_to_family_rep_aln(align_file, output, pangenome) return align_file, seq2pang -def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_files: Iterable[Path], output: Path, tmpdir: Path, is_input_seq_nt:bool, - cpu: int = 1, no_defrag: bool = False, identity: float = 0.8, coverage: float = 0.8, - translation_table: int = 11, disable_bar:bool = False) -> Tuple[set, str, dict]: + +def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_files: Iterable[Path], output: Path, + tmpdir: Path, is_input_seq_nt: bool, cpu: int = 1, no_defrag: bool = False, + identity: float = 0.8, coverage: float = 0.8, translation_table: int = 11, + disable_bar: bool = False) -> Tuple[Path, Dict[str, GeneFamily]]: """ Assign gene families from a pangenome to input sequences. @@ -488,27 +499,26 @@ def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_files: Itera and a dictionary mapping input sequences to gene families. """ - - with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir.as_posix(), delete=False, + with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir.as_posix(), delete=False, prefix="all_pangenome_genes", suffix=".fna") as tmp_pang_file: - logging.getLogger().debug(f'Write all pangenome gene sequences in {tmp_pang_file.name}') write_all_gene_sequences(pangenome, tmp_pang_file, add="ppanggolin_", disable_bar=disable_bar) align_file = align_seq_to_pang(target_seq_file=Path(tmp_pang_file.name), query_seq_files=sequence_files, - tmpdir=tmpdir, cpu=cpu, - no_defrag=no_defrag, identity=identity, coverage=coverage, - is_query_nt=is_input_seq_nt, is_target_nt=True, - translation_table=translation_table ) + tmpdir=tmpdir, cpu=cpu, + no_defrag=no_defrag, identity=identity, coverage=coverage, + is_query_nt=is_input_seq_nt, is_target_nt=True, + translation_table=translation_table) seq2pang, align_file = map_input_gene_to_family_all_aln(align_file, output, pangenome) return align_file, seq2pang + def align(pangenome: Pangenome, sequence_file: Path, output: Path, identity: float = 0.8, coverage: float = 0.8, no_defrag: bool = False, cpu: int = 1, getinfo: bool = False, use_representatives: bool = False, - draw_related: bool = False, translation_table:int=11, tmpdir: Path = None, + draw_related: bool = False, translation_table: int = 11, tmpdir: Path = None, disable_bar: bool = False, keep_tmp=False): """ Aligns pangenome sequences with sequences in a FASTA file using MMSeqs2. @@ -529,7 +539,6 @@ def align(pangenome: Pangenome, sequence_file: Path, output: Path, identity: flo :param keep_tmp: If True, keep temporary files. 
""" - tmpdir = Path(tempfile.gettempdir()) if tmpdir is None else tmpdir if pangenome.status["geneFamilySequences"] not in ["inFile", "Loaded", "Computed"]: raise Exception("Cannot use this function as your pangenome does not have gene families representatives " @@ -546,22 +555,27 @@ def align(pangenome: Pangenome, sequence_file: Path, output: Path, identity: flo need_spots=True, need_modules=need_mod, disable_bar=disable_bar) else: check_pangenome_info(pangenome, need_families=True, disable_bar=disable_bar) - + with read_compressed_or_not(sequence_file) as seqFileObj: seq_set, is_nucleotide = get_seq_ids(seqFileObj) with create_tmpdir(main_dir=tmpdir, basename="align_input_seq_tmpdir", keep_tmp=keep_tmp) as new_tmpdir: - if use_representatives: - align_file, seq2pang = get_input_seq_to_family_with_rep(pangenome, [sequence_file], output, new_tmpdir, is_input_seq_nt=is_nucleotide, - cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, - translation_table=translation_table, disable_bar=disable_bar) + align_file, seq2pang = get_input_seq_to_family_with_rep(pangenome, [sequence_file], output, new_tmpdir, + is_input_seq_nt=is_nucleotide, + cpu=cpu, no_defrag=no_defrag, identity=identity, + coverage=coverage, + translation_table=translation_table, + disable_bar=disable_bar) else: align_file, seq2pang = get_input_seq_to_family_with_all(pangenome=pangenome, sequence_files=[sequence_file], - output=output, tmpdir=new_tmpdir, is_input_seq_nt=is_nucleotide, - cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, - translation_table=translation_table, disable_bar=disable_bar) + output=output, tmpdir=new_tmpdir, + is_input_seq_nt=is_nucleotide, + cpu=cpu, no_defrag=no_defrag, identity=identity, + coverage=coverage, + translation_table=translation_table, + disable_bar=disable_bar) if getinfo or draw_related: # TODO Add getinfo to function and remove if get_seq_info(seq2pang, pangenome, output, draw_related, disable_bar=disable_bar) @@ -585,7 +599,7 @@ def launch(args: argparse.Namespace): tmpdir=args.tmpdir, identity=args.identity, coverage=args.coverage, no_defrag=args.no_defrag, cpu=args.cpu, getinfo=args.getinfo, use_representatives=args.fast, draw_related=args.draw_related, - translation_table=args.translation_table, + translation_table=args.translation_table, disable_bar=args.disable_prog_bar, keep_tmp=args.keep_tmp) @@ -626,8 +640,8 @@ def parser_align(parser: argparse.ArgumentParser): optional.add_argument('--coverage', required=False, type=float, default=0.8, help="min coverage percentage threshold") optional.add_argument("--fast", required=False, action="store_true", - help="Use representative sequences of gene families for input gene alignment. " - "This option is faster but may be less sensitive. By default, all pangenome genes are used.") + help="Use representative sequences of gene families for input gene alignment. " + "This option is faster but may be less sensitive. 
By default, all pangenome genes are used.") optional.add_argument("--translation_table", required=False, default="11", help="Translation table (genetic code) to use.") optional.add_argument("--getinfo", required=False, action="store_true", @@ -644,11 +658,12 @@ def parser_align(parser: argparse.ArgumentParser): optional.add_argument("--tmpdir", required=False, type=str, default=Path(tempfile.gettempdir()), help="directory for storing temporary files") optional.add_argument("--keep_tmp", required=False, default=False, action="store_true", - help="Keeping temporary files (useful for debugging).") + help="Keeping temporary files (useful for debugging).") + if __name__ == '__main__': """To test local change and allow using debugger""" - from ppanggolin.utils import check_log, set_verbosity_level, add_common_arguments + from ppanggolin.utils import set_verbosity_level, add_common_arguments main_parser = argparse.ArgumentParser( description="Depicting microbial species diversity via a Partitioned PanGenome Graph Of Linked Neighbors", diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index 6d8188d7..1ca725c7 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -21,10 +21,10 @@ # local libraries from ppanggolin.formats import check_pangenome_info from ppanggolin.genome import Gene, Contig -from ppanggolin.utils import mk_outdir, restricted_float, add_gene, connected_components, create_tmpdir, read_compressed_or_not -from ppanggolin.geneFamily import GeneFamily +from ppanggolin.utils import mk_outdir, restricted_float, create_tmpdir, read_compressed_or_not from ppanggolin.pangenome import Pangenome -from ppanggolin.align.alignOnPang import project_and_write_partition, get_input_seq_to_family_with_rep, get_input_seq_to_family_with_all, get_seq_ids +from ppanggolin.align.alignOnPang import project_and_write_partition, get_input_seq_to_family_with_rep, \ + get_input_seq_to_family_with_all, get_seq_ids from ppanggolin.region import GeneContext from ppanggolin.geneFamily import GeneFamily from ppanggolin.projection.projection import write_gene_to_gene_family @@ -32,9 +32,11 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: Path, tmpdir: Path, sequence_file: Path = None, families: Path = None, transitive: int = 4, identity: float = 0.5, - coverage: float = 0.8, use_representatives: bool = False, jaccard_threshold: float = 0.85, + coverage: float = 0.8, use_representatives: bool = False, + jaccard_threshold: float = 0.85, window_size: int = 1, no_defrag: bool = False, - cpu: int = 1, graph_format:str = "graphml", disable_bar=True, translation_table:int=11, keep_tmp:bool = False): + cpu: int = 1, graph_format: str = "graphml", disable_bar=True, + translation_table: int = 11, keep_tmp: bool = False): """ Main function to search common gene contexts between sequence set and pangenome families @@ -53,6 +55,7 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: Path, tmpdir: :param cpu: Number of core used to process :param graph_format: Write format of the context graph. Can be graphml or gexf :param disable_bar: Allow preventing bar progress print + :param translation_table: The translation table to use when the input sequences are nucleotide sequences. :param keep_tmp: If True, keep temporary files. 
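# --- Illustration (not part of the patch) --------------------------------
# Hypothetical programmatic call (e.g. from an external tool such as
# panorama) using the signature documented above.  `pangenome` is assumed
# to be an already loaded Pangenome object and the paths are placeholders.
# An earlier patch in this series made the function return both the context
# graph and the path of the graph file it wrote.
from pathlib import Path

graph, graph_file = search_gene_context_in_pangenome(
    pangenome=pangenome,                      # assumed: loaded Pangenome
    output=Path("context_out"),
    tmpdir=Path("/tmp"),
    sequence_file=Path("proteins_of_interest.fasta"),
    transitive=4,
    window_size=5,
    jaccard_threshold=0.85,
    graph_format="graphml",
)
print(f"{graph.number_of_nodes()} gene families kept; graph written to {graph_file}")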
""" # check statuses and load info @@ -63,77 +66,82 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: Path, tmpdir: check_pangenome_info(pangenome, need_annotations=True, need_families=True, disable_bar=disable_bar) families_of_interest = set() - fam_2_seq = None - + fam2seq = {} if sequence_file is not None: # Alignment of sequences on pangenome families with read_compressed_or_not(sequence_file) as seqFileObj: seq_set, is_nucleotide = get_seq_ids(seqFileObj) - logging.debug(f"Input sequences are {'nucletide' if is_nucleotide else 'protein'} sequences") + logging.debug(f"Input sequences are {'nucleotide' if is_nucleotide else 'protein'} sequences") with create_tmpdir(main_dir=tmpdir, basename="align_input_seq_tmpdir", keep_tmp=keep_tmp) as new_tmpdir: - + if use_representatives: - _, seqid_to_gene_family = get_input_seq_to_family_with_rep(pangenome, [sequence_file], output, new_tmpdir, is_input_seq_nt=is_nucleotide, - cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, - translation_table=translation_table, disable_bar=disable_bar) + _, seqid2fam = get_input_seq_to_family_with_rep(pangenome, [sequence_file], output, + new_tmpdir, is_input_seq_nt=is_nucleotide, + cpu=cpu, no_defrag=no_defrag, + identity=identity, coverage=coverage, + translation_table=translation_table, + disable_bar=disable_bar) else: - _, seqid_to_gene_family = get_input_seq_to_family_with_all(pangenome=pangenome, sequence_files=[sequence_file], - output=output, tmpdir=new_tmpdir, is_input_seq_nt=is_nucleotide, - cpu=cpu, no_defrag=no_defrag, - identity=identity, coverage=coverage, - translation_table=translation_table, disable_bar=disable_bar) - - project_and_write_partition(seqid_to_gene_family, seq_set, output) - write_gene_to_gene_family(seqid_to_gene_family, seq_set, output) - - for pan_family in seqid_to_gene_family.values(): + _, seqid2fam = get_input_seq_to_family_with_all(pangenome=pangenome, + sequence_files=[sequence_file], + output=output, tmpdir=new_tmpdir, + is_input_seq_nt=is_nucleotide, + cpu=cpu, no_defrag=no_defrag, + identity=identity, coverage=coverage, + translation_table=translation_table, + disable_bar=disable_bar) + + project_and_write_partition(seqid2fam, seq_set, output) + write_gene_to_gene_family(seqid2fam, seq_set, output) + fam2seq = {gf.ID: seqid for seqid, gf in seqid2fam} + for pan_family in seqid2fam.values(): families_of_interest.add(pan_family) - if families is not None: with read_compressed_or_not(families) as f: for fam_name in f.read().splitlines(): families_of_interest.add(pangenome.get_gene_family(fam_name)) - # Compute the graph with transitive closure size provided as parameter start_time = time.time() logging.getLogger().info("Building the graph...") - - gene_context_graph = compute_gene_context_graph(families=families_of_interest, transitive=transitive, + + gene_context_graph = compute_gene_context_graph(families=families_of_interest, transitive=transitive, window_size=window_size, disable_bar=disable_bar) - + logging.getLogger().info( f"Took {round(time.time() - start_time, 2)} seconds to build the graph to find common gene contexts") - - logging.getLogger().debug(f"Context graph made of {nx.number_of_nodes(gene_context_graph)} nodes and {nx.number_of_edges(gene_context_graph)} edges") + + logging.getLogger().debug( + f"Context graph made of {nx.number_of_nodes(gene_context_graph)} nodes and {nx.number_of_edges(gene_context_graph)} edges") compute_edge_metrics(gene_context_graph, jaccard_threshold) # Filter graph filter_flag = 
f'is_jaccard_gene_>_{jaccard_threshold}' - - edges_to_remove = [(n,v) for n,v,d in gene_context_graph.edges(data=True) if not d[filter_flag]] + + edges_to_remove = [(n, v) for n, v, d in gene_context_graph.edges(data=True) if not d[filter_flag]] gene_context_graph.remove_edges_from(edges_to_remove) logging.getLogger().debug(f"Filtering context graph on {filter_flag}") - logging.getLogger().debug(f"Context graph made of {nx.number_of_nodes(gene_context_graph)} nodes and {nx.number_of_edges(gene_context_graph)} edges") + logging.getLogger().debug( + f"Context graph made of {nx.number_of_nodes(gene_context_graph)} nodes and {nx.number_of_edges(gene_context_graph)} edges") gene_contexts = get_gene_contexts(gene_context_graph, families_of_interest) - gene_context_graph = make_graph_writable(gene_context_graph) out_graph_file = write_graph(gene_context_graph, output, graph_format) if len(gene_contexts) != 0: - logging.getLogger().info(f"There are {sum((len(gc) for gc in gene_contexts))} families among {len(gene_contexts)} gene contexts") - - output_file = os.path.join(output, "gene_contexts.tsv") + logging.getLogger().info( + f"There are {sum((len(gc) for gc in gene_contexts))} families among {len(gene_contexts)} gene contexts") - export_context_to_dataframe(gene_contexts, fam_2_seq, output_file) + output_file = output / "gene_contexts.tsv" + + export_context_to_dataframe(gene_contexts, fam2seq, output_file) else: logging.getLogger("PPanGGOLiN").info("No gene contexts were found") @@ -164,25 +172,26 @@ def get_gene_contexts(context_graph: nx.Graph, families_of_interest: Set[GeneFam # Connected component graph Filtering # remove singleton famillies - connected_components = (component for component in connected_components if len(component) > 1) + connected_components = (component for component in connected_components if len(component) > 1) # remove component made only of famillies not initially requested connected_components = (component for component in connected_components if component & families_of_interest) gene_contexts = set() families_in_context = set() - + for i, component in enumerate(connected_components): families_in_context |= component family_of_interest_of_gc = component & families_of_interest gene_context = GeneContext(gc_id=i, families=component, families_of_interest=family_of_interest_of_gc) - + # add gc id to node attribute - node_attributes = {n:{"gene_context_id":i, "families_of_interest": n in families_of_interest} for n in component} + node_attributes = {n: {"gene_context_id": i, "families_of_interest": n in families_of_interest} for n in + component} nx.set_node_attributes(context_graph, node_attributes) gene_contexts.add(gene_context) - + node_not_in_context = set(context_graph.nodes()) - families_in_context context_graph.remove_nodes_from(node_not_in_context) @@ -190,67 +199,64 @@ def get_gene_contexts(context_graph: nx.Graph, families_of_interest: Set[GeneFam def make_graph_writable(context_graph): - """ + The original context graph contains ppanggolin objects as nodes and lists and dictionaries in edge attributes. + Since these objects cannot be written to the output graph, + this function creates a new graph that contains only writable objects. - The original context graph contains - ppanggolin objects as nodes and lists and dictionaries in edge attributes. Since these objects - cannot be written to the output graph, this function creates a new graph that contains only - writable objects. - - :param gene_contexts: List of gene context. 
it includes graph of the context - + :param context_graph: List of gene context. it includes graph of the context """ - - def filter_attribute(data:dict): + def filter_attribute(data: dict): """ Helper function to filter the edge attributes. :param data: The edge attribute data. :return: A filtered dictionary containing only non-collection attributes. """ - return {k:v for k, v in data.items() if type(v) not in [set, dict, list]} + return {k: v for k, v in data.items() if type(v) not in [set, dict, list]} G = nx.Graph() - G.add_edges_from((f1.name, f2.name, filter_attribute(d)) for f1, f2, d in context_graph.edges(data=True)) - + G.add_edges_from((f1.name, f2.name, filter_attribute(d)) for f1, f2, d in context_graph.edges(data=True)) # convert transitivity dict to str - edges_with_transitivity_str = {(f1.name, f2.name):str(d['transitivity']) for f1, f2, d in context_graph.edges(data=True)} + edges_with_transitivity_str = {(f1.name, f2.name): str(d['transitivity']) for f1, f2, d in + context_graph.edges(data=True)} nx.set_edge_attributes(G, edges_with_transitivity_str, name="transitivity") - nodes_attributes_filtered = {f.name:filter_attribute(d) for f,d in context_graph.nodes(data=True)} + nodes_attributes_filtered = {f.name: filter_attribute(d) for f, d in context_graph.nodes(data=True)} # on top of attributes already contained in node of context graph # add organisms and genes count that have the family, the partition and if the family was in initially requested - nodes_family_data = {f.name:{"organisms": f.number_of_organisms, - "partition": f.named_partition, - "genes": f.number_of_genes} for f in context_graph.nodes()} - + nodes_family_data = {f.name: {"organisms": f.number_of_organisms, + "partition": f.named_partition, + "genes": f.number_of_genes} for f in context_graph.nodes()} + for f, d in G.nodes(data=True): d.update(nodes_family_data[f]) d.update(nodes_attributes_filtered[f]) return G -def write_graph(G:nx.Graph, output_dir: str, graph_format:str): + +def write_graph(G: nx.Graph, output_dir: Path, graph_format: str): """ Write a graph to file in the GraphML format or/and in GEXF format. + :param G: Graph to write :param output_dir: The output directory where the graph file will be written. :param graph_format: Formats of the output graph. Can be graphml or gexf """ if "graphml" == graph_format: - out_file = os.path.join(output_dir, "graph_context.graphml") + out_file = output_dir / "graph_context.graphml" logging.info(f'Writting context graph in {out_file}') nx.write_graphml_lxml(G, out_file) elif "gexf" == graph_format: - out_file = os.path.join(output_dir, "graph_context.gexf") + out_file = output_dir / "graph_context.gexf" logging.info(f'Writting context graph in {out_file}') nx.readwrite.gexf.write_gexf(G, out_file) else: @@ -258,6 +264,7 @@ def write_graph(G:nx.Graph, output_dir: str, graph_format:str): return out_file + def compute_edge_metrics(context_graph: nx.Graph, gene_proportion_cutoff: float) -> None: """ Compute various metrics on the edges of the context graph. 
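# --- Illustration (not part of the patch) --------------------------------
# Toy numbers showing the edge filter driven by --jaccard (the
# gene_proportion_cutoff below): an edge between families f1 and f2 is kept
# only if, for BOTH families, the fraction of their genes that support this
# edge reaches the cutoff.
cutoff = 0.85
f1_total, f1_on_edge = 20, 18
f2_total, f2_on_edge = 10, 7

f1_jaccard_gene = f1_on_edge / f1_total   # 0.90 -> passes
f2_jaccard_gene = f2_on_edge / f2_total   # 0.70 -> fails
keep_edge = f1_jaccard_gene >= cutoff and f2_jaccard_gene >= cutoff
print(keep_edge)  # False: this edge is removed when the graph is filtered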
@@ -267,25 +274,27 @@ def compute_edge_metrics(context_graph: nx.Graph, gene_proportion_cutoff: float) """ # compute jaccard on organism and on genes for f1, f2, data in context_graph.edges(data=True): - - data['jaccard_organism'] = len(data['organisms'])/len(set(f1.organisms) | set(f2.organisms)) - - f1_gene_proportion = len(data['genes'][f1])/f1.number_of_genes - f2_gene_proportion = len(data['genes'][f2])/f2.number_of_genes - - data[f'f1'] = f1.name - data[f'f2'] = f2.name - data[f'f1_jaccard_gene'] = f1_gene_proportion - data[f'f2_jaccard_gene'] = f2_gene_proportion - - data[f'is_jaccard_gene_>_{gene_proportion_cutoff}'] = (f1_gene_proportion >= gene_proportion_cutoff) and (f2_gene_proportion >= gene_proportion_cutoff) + data['jaccard_organism'] = len(data['organisms']) / len(set(f1.organisms) | set(f2.organisms)) + + f1_gene_proportion = len(data['genes'][f1]) / f1.number_of_genes + f2_gene_proportion = len(data['genes'][f2]) / f2.number_of_genes + + data['f1'] = f1.name + data['f2'] = f2.name + data['f1_jaccard_gene'] = f1_gene_proportion + data['f2_jaccard_gene'] = f2_gene_proportion + + data[f'is_jaccard_gene_>_{gene_proportion_cutoff}'] = (f1_gene_proportion >= gene_proportion_cutoff) and ( + f2_gene_proportion >= gene_proportion_cutoff) transitivity_counter = data['transitivity'] - mean_transitivity = sum((transitivity*counter for transitivity, counter in transitivity_counter.items()))/sum((counter for counter in transitivity_counter.values())) + mean_transitivity = sum( + (transitivity * counter for transitivity, counter in transitivity_counter.items())) / sum( + (counter for counter in transitivity_counter.values())) data['mean_transitivity'] = mean_transitivity - + # the following commented out lines are additional metrics that could be used # data['min_jaccard_organism'] = len(data['organisms'])/min(len(f1.organisms), len(f2.organisms)) @@ -297,7 +306,7 @@ def compute_edge_metrics(context_graph: nx.Graph, gene_proportion_cutoff: float) def add_edges_to_context_graph(context_graph: nx.Graph, - contig_genes: Iterable[Gene], + contig_genes: List[Gene], contig_windows: List[Tuple[int, int]], transitivity: int, is_circular: bool): @@ -314,7 +323,7 @@ def add_edges_to_context_graph(context_graph: nx.Graph, for window_start, window_end in contig_windows: for gene_index in range(window_start, window_end + 1): gene = contig_genes[gene_index] - next_genes = get_n_next_genes_index(gene_index, next_genes_count=transitivity+1, + next_genes = get_n_next_genes_index(gene_index, next_genes_count=transitivity + 1, contig_size=len(contig_genes), is_circular=is_circular) next_genes = list(next_genes) @@ -324,26 +333,25 @@ def add_edges_to_context_graph(context_graph: nx.Graph, # next_gene_index is not in any range of genes in the context # so it is ignored along with all following genes break - + next_gene = contig_genes[next_gene_index] if next_gene.family == gene.family: # If the next gene has the same family, the two genes refer to the same node # so they are ignored continue - + context_graph.add_edge(gene.family, next_gene.family) edge_dict = context_graph[gene.family][next_gene.family] if i == 0: edge_dict['adjacent_family'] = True - + # Store information of the transitivity used to link the two genes: if "transitivity" not in edge_dict: - edge_dict['transitivity'] = {i:0 for i in range(transitivity +1)} + edge_dict['transitivity'] = {i: 0 for i in range(transitivity + 1)} edge_dict['transitivity'][i] += 1 - # Add node attributes node_gene_dict = context_graph.nodes[gene.family] 
next_gene_gene_dict = context_graph.nodes[next_gene.family] @@ -354,22 +362,21 @@ def add_edges_to_context_graph(context_graph: nx.Graph, add_val_to_dict_attribute(node_gene_dict, "genes", gene) add_val_to_dict_attribute(next_gene_gene_dict, "genes", next_gene) - # Add edge attributes edge_dict = context_graph[gene.family][next_gene.family] try: genes_edge_dict = edge_dict['genes'] - except: + except Exception: genes_edge_dict = {} edge_dict['genes'] = genes_edge_dict - + add_val_to_dict_attribute(genes_edge_dict, gene.family, gene) add_val_to_dict_attribute(genes_edge_dict, next_gene.family, next_gene) add_val_to_dict_attribute(edge_dict, "organisms", gene.organism) increment_attribute_counter(edge_dict, "gene_pairs") - + assert gene.organism == next_gene.organism, f"Gene of the same contig have a different organism. {gene.organism} and {next_gene.organism}" @@ -389,7 +396,7 @@ def add_val_to_dict_attribute(attr_dict: dict, attribute_key, attribute_value): attr_dict[attribute_key] = {attribute_value} -def increment_attribute_counter(edge_dict: dict, key:Hashable): +def increment_attribute_counter(edge_dict: dict, key: Hashable): """ Increment the counter for an edge/node attribute in the edge/node dictionary. @@ -404,7 +411,8 @@ def increment_attribute_counter(edge_dict: dict, key:Hashable): edge_dict[key] = 1 -def get_n_next_genes_index(current_index: int, next_genes_count: int, contig_size: int, is_circular: bool = False) -> Iterator[int]: +def get_n_next_genes_index(current_index: int, next_genes_count: int, + contig_size: int, is_circular: bool = False) -> Iterator[int]: """ Generate the indices of the next genes based on the current index and contig properties. @@ -424,17 +432,18 @@ def get_n_next_genes_index(current_index: int, next_genes_count: int, contig_siz raise IndexError(f'current gene index is out of range. ' f"Contig has {contig_size} genes while the given gene index is {current_index}") if is_circular: - next_genes = chain(range(current_index+1, contig_size), range(0, current_index)) + next_genes = chain(range(current_index + 1, contig_size), range(0, current_index)) else: - next_genes = range(current_index+1, contig_size) - + next_genes = range(current_index + 1, contig_size) + for i, next_gene_index in enumerate(next_genes): if i == next_genes_count: break yield next_gene_index - -def extract_contig_window(contig_size: int, positions_of_interest: Iterable[int], window_size: int, is_circular:bool = False): + +def extract_contig_window(contig_size: int, positions_of_interest: Iterable[int], window_size: int, + is_circular: bool = False): """ Extracts contiguous windows around positions of interest within a contig. @@ -450,7 +459,7 @@ def extract_contig_window(contig_size: int, positions_of_interest: Iterable[int] sorted_positions = sorted(positions_of_interest) # Check if any position of interest is out of range - if sorted_positions[0] <0 or sorted_positions[-1] >= contig_size: + if sorted_positions[0] < 0 or sorted_positions[-1] >= contig_size: raise IndexError(f'Positions of interest are out of range. ' f"Contig has {contig_size} genes while given min={sorted_positions[0]} & max={sorted_positions[-1]} positions") @@ -462,33 +471,34 @@ def extract_contig_window(contig_size: int, positions_of_interest: Iterable[int] # the returned window are always checked that its positions are not out of range... 
# so there's no chance to find an out of scope position in final list if first_position - window_size < 0: - out_of_scope_position = (contig_size ) + first_position + out_of_scope_position = contig_size + first_position sorted_positions.append(out_of_scope_position) - - if last_position + window_size >= contig_size : + + if last_position + window_size >= contig_size: out_of_scope_position = last_position - contig_size sorted_positions.insert(0, out_of_scope_position) - + start_po = max(sorted_positions[0] - window_size, 0) - + for position, next_po in zip_longest(sorted_positions, sorted_positions[1:]): - + if next_po is None: # If there are no more positions, add the final window - end_po = min(position + window_size, contig_size-1) + end_po = min(position + window_size, contig_size - 1) windows_coordinates.append((start_po, end_po)) - - elif position + window_size +1 < next_po - window_size: + + elif position + window_size + 1 < next_po - window_size: # If there is a gap between positions, add the current window # and update the start position for the next window - end_po = min(position + window_size, contig_size-1) - + end_po = min(position + window_size, contig_size - 1) + windows_coordinates.append((start_po, end_po)) - + start_po = max(next_po - window_size, 0) - + return windows_coordinates + def get_contig_to_genes(gene_families: Iterable[GeneFamily]) -> Dict[Contig, Set[Gene]]: """ Group genes from specified gene families by contig. @@ -497,7 +507,7 @@ def get_contig_to_genes(gene_families: Iterable[GeneFamily]) -> Dict[Contig, Set :return: A dictionary mapping contigs to sets of genes. """ - + contig_to_genes_of_interest = defaultdict(set) for gene_family in gene_families: for gene in gene_family.genes: @@ -506,7 +516,8 @@ def get_contig_to_genes(gene_families: Iterable[GeneFamily]) -> Dict[Contig, Set return contig_to_genes_of_interest -def compute_gene_context_graph(families: Iterable[GeneFamily], transitive: int = 4, window_size: int = 0, disable_bar: bool = False) -> nx.Graph: +def compute_gene_context_graph(families: Iterable[GeneFamily], transitive: int = 4, window_size: int = 0, + disable_bar: bool = False) -> nx.Graph: """ Construct the graph of gene contexts between families of the pangenome. 
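# --- Illustration (not part of the patch) --------------------------------
# Worked examples of extract_contig_window() as defined above (assuming
# this version of the module is importable).  Positions are 0-based gene
# positions on the contig and the returned tuples are inclusive windows.
from ppanggolin.context.searchGeneContext import extract_contig_window

# one gene of interest, window of 2 genes on each side
assert extract_contig_window(11, [5], window_size=2) == [(3, 7)]
# overlapping windows around nearby genes are merged
assert extract_contig_window(20, [5, 8], window_size=2) == [(3, 10)]
# distant genes give two separate windows
assert extract_contig_window(20, [2, 15], window_size=2) == [(0, 4), (13, 17)]
# on a circular contig the window wraps around the origin
assert extract_contig_window(10, [1], window_size=3, is_circular=True) == [(0, 4), (8, 9)]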
@@ -519,28 +530,27 @@ def compute_gene_context_graph(families: Iterable[GeneFamily], transitive: int = """ context_graph = nx.Graph() - + contig_to_genes_of_interest = get_contig_to_genes(families) - - for contig, genes_of_interest in tqdm(contig_to_genes_of_interest.items(), unit="contig", total=len(contig_to_genes_of_interest), disable=disable_bar): - + + for contig, genes_of_interest in tqdm(contig_to_genes_of_interest.items(), unit="contig", + total=len(contig_to_genes_of_interest), disable=disable_bar): genes_count = contig.number_of_genes - + genes_of_interest_positions = [g.position for g in genes_of_interest] - contig_windows = extract_contig_window(genes_count, genes_of_interest_positions, - window_size=window_size, is_circular=contig.is_circular) - + contig_windows = extract_contig_window(genes_count, genes_of_interest_positions, + window_size=window_size, is_circular=contig.is_circular) + add_edges_to_context_graph(context_graph, - contig.get_genes(), - contig_windows, - transitive, - contig.is_circular) + contig.get_genes(), + contig_windows, + transitive, + contig.is_circular) return context_graph - -def fam2seq(seq_to_pan: dict) -> dict: +def fam_to_seq(seq_to_pan: dict) -> dict: """ Create a dictionary with gene families as keys and list of sequences id as values @@ -559,12 +569,12 @@ def fam2seq(seq_to_pan: dict) -> dict: return fam_2_seq -def export_context_to_dataframe(gene_contexts: set, fam_to_seq: dict, output: str): +def export_context_to_dataframe(gene_contexts: set, fam2seq: Dict[str, int], output: Path): """ Export the results into dataFrame - :param gene_contexts: connected components found in the pan - :param fam_to_seq: Dictionary with gene families as keys and list of sequence ids as values + :param gene_contexts: connected components found in the pangenome + :param fam2seq: Dictionary with gene families ID as keys and list of sequence ids as values :param output: output path """ @@ -572,20 +582,20 @@ def export_context_to_dataframe(gene_contexts: set, fam_to_seq: dict, output: st for gene_context in gene_contexts: for family in gene_context.families: - if fam_to_seq is None or fam_to_seq.get(family.ID) is None: + if fam2seq.get(family.ID) is None: sequence_id = None else: - sequence_id = ','.join(fam_to_seq.get(family.ID)) + sequence_id = ','.join(fam2seq.get(family.ID)) - family_info = {"GeneContext ID":gene_context.ID, + family_info = {"GeneContext ID": gene_context.ID, "Gene family name": family.name, - "Sequence ID":sequence_id, - "Nb Genomes":family.number_of_organisms, - "Partition": family.named_partition } + "Sequence ID": sequence_id, + "Nb Genomes": family.number_of_organisms, + "Partition": family.named_partition} lines.append(family_info) - + df = pd.DataFrame(lines).set_index("GeneContext ID") - + df = df.sort_values(["GeneContext ID", "Sequence ID"], na_position='last') df.to_csv(output, sep="\t", na_rep='NA') @@ -602,7 +612,7 @@ def launch(args: argparse.Namespace): if not any([args.sequences, args.family]): raise Exception("At least one of --sequences or --family option must be given") - + mk_outdir(args.output, args.force) pangenome = Pangenome() @@ -615,12 +625,13 @@ def launch(args: argparse.Namespace): check_pangenome_info(pangenome, need_annotations=True, need_families=True, disable_bar=args.disable_prog_bar) - search_gene_context_in_pangenome(pangenome=pangenome, output=args.output, tmpdir=args.tmpdir, sequence_file=args.sequences, families=args.family, transitive=args.transitive, - identity=args.identity, coverage=args.coverage, 
use_representatives=args.fast, jaccard_threshold=args.jaccard, + identity=args.identity, coverage=args.coverage, use_representatives=args.fast, + jaccard_threshold=args.jaccard, window_size=args.window_size, - no_defrag=args.no_defrag, cpu=args.cpu, disable_bar=args.disable_prog_bar, graph_format=args.graph_format, + no_defrag=args.no_defrag, cpu=args.cpu, disable_bar=args.disable_prog_bar, + graph_format=args.graph_format, translation_table=args.translation_table, keep_tmp=args.keep_tmp) @@ -648,8 +659,9 @@ def parser_context(parser: argparse.ArgumentParser): required = parser.add_argument_group(title="Required arguments", description="All of the following arguments are required :") required.add_argument('-p', '--pangenome', required=False, type=Path, help="The pangenome.h5 file") - required.add_argument('-o', '--output', required=False, type=Path, default="ppanggolin_context" + time.strftime("_DATE%Y-%m-%d_HOUR%H.%M.%S", - time.localtime()) + "_PID" + str(os.getpid()), + required.add_argument('-o', '--output', required=False, type=Path, + default="ppanggolin_context" + time.strftime("_DATE%Y-%m-%d_HOUR%H.%M.%S", + time.localtime()) + "_PID" + str(os.getpid()), help="Output directory where the file(s) will be written") onereq = parser.add_argument_group(title="Input file", description="One of the following argument is required :") onereq.add_argument('-S', '--sequences', required=False, type=Path, @@ -662,10 +674,10 @@ def parser_context(parser: argparse.ArgumentParser): help="DO NOT Realign gene families to link fragments with" "their non-fragmented gene family.") optional.add_argument("--fast", required=False, action="store_true", - help="Use representative sequences of gene families for input gene alignment. " - "This option is recommended for faster processing but may be less sensitive. " - "By default, all pangenome genes are used for alignment. " - "This argument makes sense only when --sequence is provided.") + help="Use representative sequences of gene families for input gene alignment. " + "This option is recommended for faster processing but may be less sensitive. " + "By default, all pangenome genes are used for alignment. " + "This argument makes sense only when --sequence is provided.") optional.add_argument('--identity', required=False, type=float, default=0.8, help="min identity percentage threshold") optional.add_argument('--coverage', required=False, type=float, default=0.8, @@ -677,18 +689,19 @@ def parser_context(parser: argparse.ArgumentParser): "non related genes allowed in-between two related genes. Increasing it will improve " "precision but lower sensitivity a little.") optional.add_argument("-w", "--window_size", required=False, type=int, default=5, - help="Number of neighboring genes that are considered on each side of " - "a gene of interest when searching for conserved genomic contexts.") - + help="Number of neighboring genes that are considered on each side of " + "a gene of interest when searching for conserved genomic contexts.") + optional.add_argument("-s", "--jaccard", required=False, type=restricted_float, default=0.85, help="minimum jaccard similarity used to filter edges between gene families. Increasing it " "will improve precision but lower sensitivity a lot.") - optional.add_argument('--graph_format', help="Format of the context graph. Can be gexf or graphml.", default='graphml', choices=['gexf','graphml']) + optional.add_argument('--graph_format', help="Format of the context graph. 
Can be gexf or graphml.", + default='graphml', choices=['gexf', 'graphml']) optional.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus") optional.add_argument("--tmpdir", required=False, type=str, default=Path(tempfile.gettempdir()), help="directory for storing temporary files") optional.add_argument("--keep_tmp", required=False, default=False, action="store_true", - help="Keeping temporary files (useful for debugging).") + help="Keeping temporary files (useful for debugging).") if __name__ == '__main__': diff --git a/ppanggolin/genome.py b/ppanggolin/genome.py index 2c8cde9c..ef2e13b0 100644 --- a/ppanggolin/genome.py +++ b/ppanggolin/genome.py @@ -438,7 +438,9 @@ def remove(self, position): del self[position] def get_genes(self, begin: int = 0, end: int = None) -> List[Gene]: - """Gets a list of genes within a range. If argument is given it return all genes. + """ + Gets a list of genes within a range. + If no arguments are given it return all genes. :param begin: Position of the first gene to retrieve :param end: Position of the last gene to not retrieve diff --git a/ppanggolin/region.py b/ppanggolin/region.py index 695ae608..6365e408 100644 --- a/ppanggolin/region.py +++ b/ppanggolin/region.py @@ -737,17 +737,24 @@ def mk_bitarray(self, index: Dict[Organism, int], partition: str = 'all'): class GeneContext: """ - A class used to represent a gene context which is a collection of gene families related to a specific genomic context.. + Represent a gene context which is a collection of gene families related to a specific genomic context.. - :param gc_id: Identifier of the gene context. - :param families: Gene families related to the gene context. - :param families_of_interest: Input families for which the context is being searched. + Methods + - families: Generator that yields all the gene families in the gene context. + - add_context_graph: Add a context graph corresponding to the gene context. + - add_family: Add a gene family to the gene context. + + Fields + - gc_id: The identifier of the gene context. + - graph: context graph corresponding to the gene context """ def __init__(self, gc_id: int, families: Set[GeneFamily] = None, families_of_interest: Set[GeneFamily] = None): """Constructor method - :param gc_id : Identifier of the Gene context - :param families: Gene families related to the GeneContext + + :param gc_id: Identifier of the gene context. + :param families: Gene families related to the gene context. + :param families_of_interest: Input families for which the context is being searched. """ if not isinstance(gc_id, int): @@ -756,7 +763,7 @@ def __init__(self, gc_id: int, families: Set[GeneFamily] = None, families_of_int self.ID = gc_id self._families_getter = {} self.families_of_interest = families_of_interest - self.graph = None + self._graph = None if families is not None: if not all(isinstance(fam, GeneFamily) for fam in families): raise Exception("You provided elements that were not GeneFamily objects. " @@ -846,15 +853,23 @@ def __delitem__(self, name): except KeyError: raise KeyError(f"There isn't gene family with the name {name} in the gene context") - def add_context_graph(self, graph: nx.Graph): + @property + def graph(self): + if self._graph is None: + raise ValueError("Graph has not been added to the context") + return self._graph + + @graph.setter + def graph(self, graph: nx.Graph): """ Add a context graph to the gene context. :param graph: The context graph. 
""" - self.graph = graph - - + if not isinstance(nx.Graph, graph): + logging.getLogger("PPanGGOLiN").debug(f"given type: {type(graph)}") + raise TypeError("Context graph must be a networkx graph object.") + self._graph = graph @property def families(self) -> Generator[GeneFamily, None, None]: @@ -864,7 +879,6 @@ def families(self) -> Generator[GeneFamily, None, None]: """ yield from self._families_getter.values() - def add_family(self, family: GeneFamily): """ Add a gene family to the gene context. @@ -874,4 +888,4 @@ def add_family(self, family: GeneFamily): if not isinstance(family, GeneFamily): raise Exception("You did not provide a GeneFamily object. " "GeneContexts are only made of GeneFamily objects.") - self.families.add(family) \ No newline at end of file + self[family.name] = family From 49d3cdf40ef6074b893fc44c475d1a44bb706331 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Fri, 13 Oct 2023 11:17:55 +0200 Subject: [PATCH 23/25] Fix bug in writting dict --- VERSION | 2 +- ppanggolin/context/searchGeneContext.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/VERSION b/VERSION index a11c24f7..e805773e 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.192 +1.2.193 diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index 1ca725c7..2ed6ae06 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -95,7 +95,7 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: Path, tmpdir: project_and_write_partition(seqid2fam, seq_set, output) write_gene_to_gene_family(seqid2fam, seq_set, output) - fam2seq = {gf.ID: seqid for seqid, gf in seqid2fam} + fam2seq = {gf.ID: seqid for seqid, gf in seqid2fam.items()} for pan_family in seqid2fam.values(): families_of_interest.add(pan_family) From 765a24663855b646df0c9af0f7509682d867015e Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Fri, 13 Oct 2023 18:05:55 +0200 Subject: [PATCH 24/25] add target family column and fix sequence id --- docs/user/Genomic-context.md | 5 +++-- ppanggolin/context/searchGeneContext.py | 23 ++++++++++++++--------- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/docs/user/Genomic-context.md b/docs/user/Genomic-context.md index 671b9e20..96b2e6d0 100644 --- a/docs/user/Genomic-context.md +++ b/docs/user/Genomic-context.md @@ -30,13 +30,14 @@ In this case, you can give a pangenome without gene families representatives seq In case of you are using families ID, you will only have as output the `gene_context.tsv` file. In the other case, you use sequences, you will have another output file to report the alignment between sequences and pangenome families (see detail in align subcommand). -There are 4 columns in `gene_context.tsv`. +There are 6 columns in `gene_context.tsv`. -1. **geneContext ID**: identifier of the found context. It is incrementally generated, beginning with 1 +1. **geneContext ID**: Identifier of the found context. It is incrementally generated, beginning with 1 2. **Gene family name**: Identifier of the gene family, from the pangenome, correspond to the found context 3. **Sequence ID**: Identifier of the searched sequence in the pangenome 4. **Nb Genomes**: Number of genomes where the genomic context is found 5. **Partition**: Partition of the gene family corresponding to the found context +6. **Target family**: Whether the family is a target family, meaning it matches an input sequence, or a family provided as input. 
 In **Sequence ID**, it is possible to find a NA value. This corresponds to another gene family found in the context.

diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py
index 2ed6ae06..18fe83ce 100644
--- a/ppanggolin/context/searchGeneContext.py
+++ b/ppanggolin/context/searchGeneContext.py
@@ -66,7 +66,7 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: Path, tmpdir:
     check_pangenome_info(pangenome, need_annotations=True, need_families=True, disable_bar=disable_bar)
 
     families_of_interest = set()
-    fam2seq = {}
+    family_2_input_seqid = None
    if sequence_file is not None:
         # Alignment of sequences on pangenome families
         with read_compressed_or_not(sequence_file) as seqFileObj:
@@ -95,7 +95,11 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: Path, tmpdir:
 
         project_and_write_partition(seqid2fam, seq_set, output)
         write_gene_to_gene_family(seqid2fam, seq_set, output)
-        fam2seq = {gf.ID: seqid for seqid, gf in seqid2fam.items()}
+
+        family_2_input_seqid = defaultdict(set)
+        for seqid, gf in seqid2fam.items():
+            family_2_input_seqid[gf].add(seqid)
+
         for pan_family in seqid2fam.values():
             families_of_interest.add(pan_family)
 
@@ -140,8 +144,7 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: Path, tmpdir:
             f"There are {sum((len(gc) for gc in gene_contexts))} families among {len(gene_contexts)} gene contexts")
 
         output_file = output / "gene_contexts.tsv"
-
-        export_context_to_dataframe(gene_contexts, fam2seq, output_file)
+        export_context_to_dataframe(gene_contexts, family_2_input_seqid, families_of_interest, output_file)
 
     else:
         logging.getLogger("PPanGGOLiN").info("No gene contexts were found")
 
@@ -569,29 +572,31 @@ def fam_to_seq(seq_to_pan: dict) -> dict:
     return fam_2_seq
 
 
-def export_context_to_dataframe(gene_contexts: set, fam2seq: Dict[str, int], output: Path):
+def export_context_to_dataframe(gene_contexts: set, fam2seq: Dict[str, int], families_of_interest: Set[GeneFamily], output: Path):
     """
     Export the results into dataFrame
 
     :param gene_contexts: connected components found in the pangenome
     :param fam2seq: Dictionary with gene families ID as keys and list of sequence ids as values
+    :param families_of_interest: families of interest that are at the origin of the context.
:param output: output path """ lines = [] for gene_context in gene_contexts: for family in gene_context.families: - - if fam2seq.get(family.ID) is None: + if fam2seq.get(family) is None: sequence_id = None else: - sequence_id = ','.join(fam2seq.get(family.ID)) + sequence_id = ','.join(fam2seq.get(family)) family_info = {"GeneContext ID": gene_context.ID, "Gene family name": family.name, "Sequence ID": sequence_id, "Nb Genomes": family.number_of_organisms, - "Partition": family.named_partition} + "Partition": family.named_partition, + "Target family": family in families_of_interest} + lines.append(family_info) df = pd.DataFrame(lines).set_index("GeneContext ID") From b878a4c649ca45657ae90ce53c29dfd19f5dd43f Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Fri, 13 Oct 2023 18:30:07 +0200 Subject: [PATCH 25/25] fix famseq wrong None type --- ppanggolin/context/searchGeneContext.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index 18fe83ce..fc3ded05 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -66,7 +66,7 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: Path, tmpdir: check_pangenome_info(pangenome, need_annotations=True, need_families=True, disable_bar=disable_bar) families_of_interest = set() - family_2_input_seqid = None + family_2_input_seqid = {} if sequence_file is not None: # Alignment of sequences on pangenome families with read_compressed_or_not(sequence_file) as seqFileObj:
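
As a quick illustration of what the last three patches change, here is a short standalone sketch (plain Python, not part of the patch series): it shows why iterating the `seqid2fam` dict without `.items()` fails (the bug fixed in PATCH 23/25), how grouping input sequence ids per family with a `defaultdict(set)` keeps every hit (PATCH 24/25), and how the `Target family` flag and the NA `Sequence ID` case described in Genomic-context.md arise. `ToyFamily` and the sequence ids are made-up stand-ins, not PPanGGOLiN objects.

from collections import defaultdict

class ToyFamily:
    """Illustrative stand-in for a pangenome gene family (not the real GeneFamily class)."""
    def __init__(self, fam_id, name):
        self.ID = fam_id
        self.name = name

fam_a, fam_b, fam_c = ToyFamily(1, "famA"), ToyFamily(2, "famB"), ToyFamily(3, "famC")

# seqid2fam: each input sequence id mapped to the family it aligned to.
seqid2fam = {"seq1": fam_a, "seq2": fam_a, "seq3": fam_b}

# Pre-PATCH 23 pattern: iterating the dict directly yields only the keys,
# so unpacking into (seqid, gf) raises ValueError at runtime:
#   fam2seq = {gf.ID: seqid for seqid, gf in seqid2fam}
# A plain dict comprehension would also keep a single sequence id per family,
# even though several input sequences can hit the same family.

# PATCH 24 pattern: group all input sequence ids per family.
family_2_input_seqid = defaultdict(set)
for seqid, gf in seqid2fam.items():
    family_2_input_seqid[gf].add(seqid)

families_of_interest = set(seqid2fam.values())

# fam_c stands for a family present in a context but matching no input sequence:
# it gets "NA" in the Sequence ID column and False in Target family.
for family in (fam_a, fam_b, fam_c):
    seq_ids = family_2_input_seqid.get(family)
    print(family.name,
          ",".join(sorted(seq_ids)) if seq_ids else "NA",
          family in families_of_interest)

Under these made-up inputs the printout is famA/seq1,seq2/True, famB/seq3/True and famC/NA/False, which matches the behaviour the updated Genomic-context.md describes.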