diff --git a/docs/developing_a_pheval_plugin.md b/docs/developing_a_pheval_plugin.md index d069caf2d..85a935ae2 100644 --- a/docs/developing_a_pheval_plugin.md +++ b/docs/developing_a_pheval_plugin.md @@ -242,6 +242,7 @@ The dataclasses representing essential information extracted from tool-specific show_source: true --- +For variant prioritisation results, the `grouping_id` parameter is designed to handle compound heterozygous variants in ranking. Compound heterozygosity occurs when two or more variants, inherited together, contribute to a phenotype. Variants that are part of the same compound heterozygous group (e.g., within the same gene) should be assigned the same `grouping_id` so that they are ranked as a single entity, preserving their combined significance. Variants that are not part of any compound heterozygous group should each have a unique `grouping_id`, which prevents unintended overlap in ranking and ensures that each group or individual variant is accurately represented. Grouping is only applied when every result in a result set has a `grouping_id`; if any value is missing, the results are ranked by score alone. Variants in the same group should also share the same score, because only variants with an identical score and `grouping_id` are ranked as one entity. 
::: src.pheval.post_processing.post_processing.PhEvalVariantResult handler: python options: diff --git a/src/pheval/post_processing/post_processing.py b/src/pheval/post_processing/post_processing.py index 920c7ae1d..609c98233 100644 --- a/src/pheval/post_processing/post_processing.py +++ b/src/pheval/post_processing/post_processing.py @@ -1,6 +1,6 @@ import logging import operator -from dataclasses import dataclass +from dataclasses import dataclass, field from enum import Enum from pathlib import Path from typing import List, Union @@ -96,6 +96,7 @@ class PhEvalVariantResult(PhEvalResult): ref: str alt: str score: float + grouping_id: str = field(default=None) @dataclass @@ -105,7 +106,7 @@ class RankedPhEvalVariantResult(PhEvalVariantResult): rank (int): The rank for the result entry """ - rank: int + rank: int = 0 @staticmethod def from_variant_result(pheval_variant_result: PhEvalVariantResult, rank: int): @@ -228,26 +229,57 @@ def sort_pheval_results(self) -> [PhEvalResult]: ) -def _rank_pheval_result(pheval_result: [PhEvalResult], sort_order: SortOrder) -> pd.DataFrame: - """ - Rank PhEval results post-processed from tool-specific output, managing tied scores (ex aequo) +class ResultRanker: + def __init__(self, pheval_result: List[PhEvalResult], sort_order: SortOrder): + """ + Initialise the ResultRanker. + Args: + pheval_result (List[PhEvalResult]): PhEval results to rank. + sort_order (SortOrder): Sorting order based on which ranking is performed. + """ + self.pheval_result = pheval_result + self.sort_order = sort_order + self.ascending = sort_order == SortOrder.ASCENDING - Args: - pheval_result ([PhEvalResult]): PhEval results obtained from tool-specific output - sort_order (SortOrder): Sorting order based on which ranking is performed + def rank(self) -> pd.DataFrame: + """ + Rank PhEval results, managing tied scores (ex aequo) and handling grouping_id if present. 
- Returns: - pd.DataFrame : Ranked PhEval results with tied scores managed + Returns: + pd.DataFrame : Ranked PhEval results with tied scores managed. + """ + pheval_result_df = pd.DataFrame([data.__dict__ for data in self.pheval_result]) - Raises: - ValueError: If an incompatible PhEval result type is encountered - """ - pheval_result_df = pd.DataFrame([data.__dict__ for data in pheval_result]) - if sort_order == SortOrder.ASCENDING: - pheval_result_df["rank"] = pheval_result_df["score"].rank(method="max", ascending=True) - elif sort_order == SortOrder.DESCENDING: - pheval_result_df["rank"] = pheval_result_df["score"].rank(method="max", ascending=False) - return pheval_result_df + if self._has_valid_grouping_id(pheval_result_df): + pheval_result_df = self._rank_with_grouping_id(pheval_result_df) + else: + pheval_result_df = self._rank_without_grouping_id(pheval_result_df) + return pheval_result_df.drop(columns=["min_rank", "grouping_id"], errors="ignore") + + @staticmethod + def _has_valid_grouping_id(pheval_result_df: pd.DataFrame) -> bool: + """Check if grouping_id exists and has no None values.""" + return ( + "grouping_id" in pheval_result_df.columns + and not pheval_result_df["grouping_id"].isnull().any() + ) + + def _rank_with_grouping_id(self, pheval_result_df: pd.DataFrame) -> pd.DataFrame: + """Apply ranking when grouping_id is present and has no None values.""" + pheval_result_df["min_rank"] = ( + pheval_result_df.groupby(["score", "grouping_id"]) + .ngroup() + .rank(method="dense", ascending=self.ascending) + ).astype(int) + pheval_result_df["rank"] = pheval_result_df.groupby("score")["min_rank"].transform("max") + return pheval_result_df + + def _rank_without_grouping_id(self, pheval_result_df: pd.DataFrame) -> pd.DataFrame: + """Apply ranking without using grouping_id.""" + pheval_result_df["rank"] = ( + pheval_result_df["score"].rank(method="max", ascending=self.ascending).astype(int) + ) + return pheval_result_df def 
_return_sort_order(sort_order_str: str) -> SortOrder: @@ -282,7 +314,7 @@ def _create_pheval_result(pheval_result: [PhEvalResult], sort_order_str: str) -> """ sort_order = _return_sort_order(sort_order_str) sorted_pheval_result = ResultSorter(pheval_result, sort_order).sort_pheval_results() - return _rank_pheval_result(sorted_pheval_result, sort_order) + return ResultRanker(sorted_pheval_result, sort_order).rank() def _write_pheval_gene_result( diff --git a/tests/test_post_processing.py b/tests/test_post_processing.py index f68d2fd73..f6cfbb749 100644 --- a/tests/test_post_processing.py +++ b/tests/test_post_processing.py @@ -9,10 +9,10 @@ RankedPhEvalDiseaseResult, RankedPhEvalGeneResult, RankedPhEvalVariantResult, + ResultRanker, ResultSorter, SortOrder, _create_pheval_result, - _rank_pheval_result, _return_sort_order, calculate_end_pos, ) @@ -217,41 +217,145 @@ def test_sort_pheval_results_pvalue(self): ) -class TestRankPhEvalResults(unittest.TestCase): +class TestResultRanker(unittest.TestCase): @classmethod def setUpClass(cls) -> None: - cls.sorted_gene_result = [ - PhEvalGeneResult( - gene_symbol="MAP3K14", gene_identifier="ENSG00000006062", score=0.9234 - ), - PhEvalGeneResult(gene_symbol="A4GNT", gene_identifier="ENSG00000118017", score=0.6529), - PhEvalGeneResult(gene_symbol="OR14J1", gene_identifier="ENSG00000204695", score=0.6529), - PhEvalGeneResult(gene_symbol="PAGE1", gene_identifier="ENSG00000068985", score=0.5235), - ] - cls.sorted_variant_result = [ - PhEvalVariantResult( - chromosome="X", start=93473023, end=93473024, ref="A", alt="G", score=0.1245 - ), - PhEvalVariantResult( - chromosome="8", start=532356, end=532357, ref="A", alt="C", score=0.4578 - ), - PhEvalVariantResult( - chromosome="5", - start=23457444233, - end=23457444234, - ref="A", - alt="C", - score=0.9348, - ), - PhEvalVariantResult( - chromosome="12", start=12754332, end=12754333, ref="T", alt="G", score=0.9999 - ), - ] - cls.sorted_disease_result = pheval_disease_result + 
cls.gene_result_ranker = ResultRanker( + [ + PhEvalGeneResult( + gene_symbol="MAP3K14", gene_identifier="ENSG00000006062", score=0.9234 + ), + PhEvalGeneResult( + gene_symbol="A4GNT", gene_identifier="ENSG00000118017", score=0.6529 + ), + PhEvalGeneResult( + gene_symbol="OR14J1", gene_identifier="ENSG00000204695", score=0.6529 + ), + PhEvalGeneResult( + gene_symbol="PAGE1", gene_identifier="ENSG00000068985", score=0.5235 + ), + ], + SortOrder.DESCENDING, + ) + cls.variant_result_ranker = ResultRanker( + [ + PhEvalVariantResult( + chromosome="X", start=93473023, end=93473024, ref="A", alt="G", score=0.1245 + ), + PhEvalVariantResult( + chromosome="8", start=532356, end=532357, ref="A", alt="C", score=0.4578 + ), + PhEvalVariantResult( + chromosome="5", + start=23457444233, + end=23457444234, + ref="A", + alt="C", + score=0.9999, + ), + PhEvalVariantResult( + chromosome="12", start=12754332, end=12754333, ref="T", alt="G", score=0.9999 + ), + ], + sort_order=SortOrder.ASCENDING, + ) + cls.variant_result_ranker_grouping_id = ResultRanker( + [ + PhEvalVariantResult( + chromosome="X", + start=93473023, + end=93473024, + ref="A", + alt="G", + score=0.1245, + grouping_id="4567", + ), + PhEvalVariantResult( + chromosome="8", + start=532356, + end=532357, + ref="A", + alt="C", + score=0.4578, + grouping_id="789", + ), + PhEvalVariantResult( + chromosome="5", + start=23457444233, + end=23457444234, + ref="A", + alt="C", + score=0.9999, + grouping_id="12345", + ), + PhEvalVariantResult( + chromosome="12", + start=12754332, + end=12754333, + ref="T", + alt="G", + score=0.9999, + grouping_id="12345", + ), + ], + sort_order=SortOrder.DESCENDING, + ) + cls.disease_result_ranker = ResultRanker( + pheval_disease_result, sort_order=SortOrder.DESCENDING + ) + + def test__has_valid_grouping_id(self): + df = pd.DataFrame({"score": [0.5, 0.7, 0.3], "grouping_id": ["A", "B", "C"]}) + self.assertTrue(self.variant_result_ranker._has_valid_grouping_id(df)) + + def 
test__has_valid_grouping_id_present_with_none(self): + df = pd.DataFrame({"score": [0.5, 0.7, 0.3], "grouping_id": ["A", None, "C"]}) + self.assertFalse(self.variant_result_ranker._has_valid_grouping_id(df)) + + def test__has_valid_grouping_id_not_present(self): + df = pd.DataFrame({"score": [0.5, 0.7, 0.3]}) + self.assertFalse(self.variant_result_ranker._has_valid_grouping_id(df)) + + def test__rank_with_grouping_id(self): + df = pd.DataFrame( + { + "score": [0.9, 0.9, 0.8, 0.7, 0.7, 0.6, 0.5, 0.5, 0.4], + "grouping_id": ["A", "A", "B", "C", "D", "E", "F", "F", "G"], + } + ) + self.assertTrue( + self.gene_result_ranker._rank_with_grouping_id(df).equals( + pd.DataFrame( + { + "score": [0.9, 0.9, 0.8, 0.7, 0.7, 0.6, 0.5, 0.5, 0.4], + "grouping_id": ["A", "A", "B", "C", "D", "E", "F", "F", "G"], + "min_rank": [1, 1, 2, 4, 3, 5, 6, 6, 7], + "rank": [1, 1, 2, 4, 4, 5, 6, 6, 7], + } + ) + ) + ) + + def test__rank_without_grouping_id(self): + df = pd.DataFrame( + { + "score": [0.9, 0.9, 0.8, 0.7, 0.7, 0.6, 0.5, 0.5, 0.4], + } + ) + self.assertTrue( + self.gene_result_ranker._rank_without_grouping_id(df).equals( + pd.DataFrame( + { + "score": [0.9, 0.9, 0.8, 0.7, 0.7, 0.6, 0.5, 0.5, 0.4], + "rank": [2, 2, 3, 5, 5, 6, 8, 8, 9], + } + ) + ) + ) def test_rank_pheval_results_gene(self): self.assertTrue( - _rank_pheval_result(self.sorted_gene_result, SortOrder.DESCENDING).equals( + self.gene_result_ranker.rank().equals( pd.DataFrame( { "gene_symbol": ["MAP3K14", "A4GNT", "OR14J1", "PAGE1"], @@ -262,7 +366,7 @@ def test_rank_pheval_results_gene(self): "ENSG00000068985", ], "score": [0.9234, 0.6529, 0.6529, 0.5235], - "rank": [1.0, 3.0, 3.0, 4.0], + "rank": [1, 3, 3, 4], } ) ) @@ -270,7 +374,7 @@ def test_rank_pheval_results_gene(self): def test_rank_pheval_results_variant(self): self.assertTrue( - _rank_pheval_result(self.sorted_variant_result, SortOrder.ASCENDING).equals( + self.variant_result_ranker.rank().equals( pd.DataFrame( { "chromosome": ["X", "8", "5", "12"], @@ -278,8 
+382,25 @@ def test_rank_pheval_results_variant(self): "end": [93473024, 532357, 23457444234, 12754333], "ref": ["A", "A", "A", "T"], "alt": ["G", "C", "C", "G"], - "score": [0.1245, 0.4578, 0.9348, 0.9999], - "rank": [1.0, 2.0, 3.0, 4.0], + "score": [0.1245, 0.4578, 0.9999, 0.9999], + "rank": [1, 2, 4, 4], + } + ) + ) + ) + + def test_rank_pheval_results_variant_grouping_id(self): + self.assertTrue( + self.variant_result_ranker_grouping_id.rank().equals( + pd.DataFrame( + { + "chromosome": ["X", "8", "5", "12"], + "start": [93473023, 532356, 23457444233, 12754332], + "end": [93473024, 532357, 23457444234, 12754333], + "ref": ["A", "A", "A", "T"], + "alt": ["G", "C", "C", "G"], + "score": [0.1245, 0.4578, 0.9999, 0.9999], + "rank": [3, 2, 1, 1], } ) ) @@ -287,7 +408,7 @@ def test_rank_pheval_results_variant(self): def test_rank_pheval_results_disease(self): self.assertTrue( - _rank_pheval_result(self.sorted_disease_result, SortOrder.DESCENDING).equals( + self.disease_result_ranker.rank().equals( pd.DataFrame( { "disease_name": { @@ -301,7 +422,7 @@ def test_rank_pheval_results_disease(self): 2: "OMIM:614483", }, "score": {0: 4.284, 1: 4.284, 2: -1.871}, - "rank": {0: 2.0, 1: 2.0, 2: 3.0}, + "rank": {0: 2, 1: 2, 2: 3}, } ) ) @@ -322,7 +443,7 @@ def test_create_pheval_result_gene(self): "ENSG00000068985", ], "score": [0.9234, 0.6529, 0.6529, 0.5235], - "rank": [1.0, 3.0, 3.0, 4.0], + "rank": [1, 3, 3, 4], } ) ) @@ -339,7 +460,7 @@ def test_create_pheval_result_variant(self): "ref": ["A", "A", "A", "T"], "alt": ["G", "C", "C", "G"], "score": [0.1245, 0.4578, 0.9348, 0.9999], - "rank": [1.0, 2.0, 3.0, 4.0], + "rank": [1, 2, 3, 4], } ) ) @@ -361,7 +482,7 @@ def test_create_pheval_result_disease(self): 2: "OMIM:614483", }, "score": {0: 4.284, 1: 4.284, 2: -1.871}, - "rank": {0: 2.0, 1: 2.0, 2: 3.0}, + "rank": {0: 2, 1: 2, 2: 3}, } ) )