monarch-initiative · yaseminbridges · Nov 5, 2024 · Nov 5, 2024 · Nov 5, 2024 · Nov 6, 2024
diff --git a/docs/developing_a_pheval_plugin.md b/docs/developing_a_pheval_plugin.md
@@ -242,6 +242,7 @@ The dataclasses representing essential information extracted from tool-specific
       show_source: true
 ---
 
+For variant prioritisation results, the optional `grouping_id` parameter is designed to handle compound heterozygous variants in ranking. Compound heterozygosity occurs when two or more variants, inherited together, contribute to a phenotype. For this purpose, variants that are part of the same compound heterozygous group (e.g., within the same gene) should be assigned the same `grouping_id`. This ensures they are ranked as a single entity, preserving their combined significance. Variants that are not part of any compound heterozygous group should each have a unique `grouping_id`. This approach prevents any unintended overlap in ranking and ensures that each group or individual variant is accurately represented. Note that grouping is only applied when every variant result has a `grouping_id`; if any result is missing one, all results are ranked by score alone.
 ::: src.pheval.post_processing.post_processing.PhEvalVariantResult
     handler: python
     options:

diff --git a/src/pheval/post_processing/post_processing.py b/src/pheval/post_processing/post_processing.py
@@ -1,6 +1,6 @@
 import logging
 import operator
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from enum import Enum
 from pathlib import Path
 from typing import List, Union
@@ -96,6 +96,7 @@ class PhEvalVariantResult(PhEvalResult):
     ref: str
     alt: str
     score: float
+    grouping_id: str = field(default=None)
 
 
 @dataclass
@@ -105,7 +106,7 @@ class RankedPhEvalVariantResult(PhEvalVariantResult):
         rank (int): The rank for the result entry
     """
 
-    rank: int
+    rank: int = 0
 
     @staticmethod
     def from_variant_result(pheval_variant_result: PhEvalVariantResult, rank: int):
@@ -228,26 +229,57 @@ def sort_pheval_results(self) -> [PhEvalResult]:
         )
 
 
-def _rank_pheval_result(pheval_result: [PhEvalResult], sort_order: SortOrder) -> pd.DataFrame:
-    """
-    Rank PhEval results post-processed from tool-specific output, managing tied scores (ex aequo)
+class ResultRanker:
+    def __init__(self, pheval_result: List[PhEvalResult], sort_order: SortOrder):
+        """
+        Initialise the ResultRanker.
+        Args:
+            pheval_result (List[PhEvalResult]): PhEval results to rank.
+            sort_order (SortOrder): Sorting order based on which ranking is performed.
+        """
+        self.pheval_result = pheval_result
+        self.sort_order = sort_order
+        self.ascending = sort_order == SortOrder.ASCENDING
 
-    Args:
-        pheval_result ([PhEvalResult]): PhEval results obtained from tool-specific output
-        sort_order (SortOrder): Sorting order based on which ranking is performed
+    def rank(self) -> pd.DataFrame:
+        """
+        Rank PhEval results, managing tied scores (ex aequo) and handling grouping_id if present.
 
-    Returns:
-        pd.DataFrame : Ranked PhEval results with tied scores managed
+        Returns:
+            pd.DataFrame : Ranked PhEval results with tied scores managed.
+        """
+        pheval_result_df = pd.DataFrame([data.__dict__ for data in self.pheval_result])
 
-    Raises:
-        ValueError: If an incompatible PhEval result type is encountered
-    """
-    pheval_result_df = pd.DataFrame([data.__dict__ for data in pheval_result])
-    if sort_order == SortOrder.ASCENDING:
-        pheval_result_df["rank"] = pheval_result_df["score"].rank(method="max", ascending=True)
-    elif sort_order == SortOrder.DESCENDING:
-        pheval_result_df["rank"] = pheval_result_df["score"].rank(method="max", ascending=False)
-    return pheval_result_df
+        if self._has_valid_grouping_id(pheval_result_df):
+            pheval_result_df = self._rank_with_grouping_id(pheval_result_df)
+        else:
+            pheval_result_df = self._rank_without_grouping_id(pheval_result_df)
+        return pheval_result_df.drop(columns=["min_rank", "grouping_id"], errors="ignore")
+
+    @staticmethod
+    def _has_valid_grouping_id(pheval_result_df: pd.DataFrame) -> bool:
+        """Check that a grouping_id column exists and contains no null (None/NaN) values."""
+        return (
+            "grouping_id" in pheval_result_df.columns
+            and not pheval_result_df["grouping_id"].isnull().any()
+        )
+
+    def _rank_with_grouping_id(self, pheval_result_df: pd.DataFrame) -> pd.DataFrame:
+        """Rank so rows sharing a score and grouping_id count once; score ties take the max rank."""
+        pheval_result_df["min_rank"] = (
+            pheval_result_df.groupby(["score", "grouping_id"])
+            .ngroup()
+            .rank(method="dense", ascending=self.ascending)
+        ).astype(int)
+        pheval_result_df["rank"] = pheval_result_df.groupby("score")["min_rank"].transform("max")
+        return pheval_result_df
+
+    def _rank_without_grouping_id(self, pheval_result_df: pd.DataFrame) -> pd.DataFrame:
+        """Apply ranking without using grouping_id."""
+        pheval_result_df["rank"] = (
+            pheval_result_df["score"].rank(method="max", ascending=self.ascending).astype(int)
+        )
+        return pheval_result_df
 
 
 def _return_sort_order(sort_order_str: str) -> SortOrder:
@@ -282,7 +314,7 @@ def _create_pheval_result(pheval_result: [PhEvalResult], sort_order_str: str) ->
     """
     sort_order = _return_sort_order(sort_order_str)
     sorted_pheval_result = ResultSorter(pheval_result, sort_order).sort_pheval_results()
-    return _rank_pheval_result(sorted_pheval_result, sort_order)
+    return ResultRanker(sorted_pheval_result, sort_order).rank()
 
 
 def _write_pheval_gene_result(

diff --git a/tests/test_post_processing.py b/tests/test_post_processing.py
@@ -9,10 +9,10 @@
     RankedPhEvalDiseaseResult,
     RankedPhEvalGeneResult,
     RankedPhEvalVariantResult,
+    ResultRanker,
     ResultSorter,
     SortOrder,
     _create_pheval_result,
-    _rank_pheval_result,
     _return_sort_order,
     calculate_end_pos,
 )
@@ -217,41 +217,145 @@ def test_sort_pheval_results_pvalue(self):
         )
 
 
-class TestRankPhEvalResults(unittest.TestCase):
+class TestResultRanker(unittest.TestCase):
     @classmethod
     def setUpClass(cls) -> None:
-        cls.sorted_gene_result = [
-            PhEvalGeneResult(
-                gene_symbol="MAP3K14", gene_identifier="ENSG00000006062", score=0.9234
-            ),
-            PhEvalGeneResult(gene_symbol="A4GNT", gene_identifier="ENSG00000118017", score=0.6529),
-            PhEvalGeneResult(gene_symbol="OR14J1", gene_identifier="ENSG00000204695", score=0.6529),
-            PhEvalGeneResult(gene_symbol="PAGE1", gene_identifier="ENSG00000068985", score=0.5235),
-        ]
-        cls.sorted_variant_result = [
-            PhEvalVariantResult(
-                chromosome="X", start=93473023, end=93473024, ref="A", alt="G", score=0.1245
-            ),
-            PhEvalVariantResult(
-                chromosome="8", start=532356, end=532357, ref="A", alt="C", score=0.4578
-            ),
-            PhEvalVariantResult(
-                chromosome="5",
-                start=23457444233,
-                end=23457444234,
-                ref="A",
-                alt="C",
-                score=0.9348,
-            ),
-            PhEvalVariantResult(
-                chromosome="12", start=12754332, end=12754333, ref="T", alt="G", score=0.9999
-            ),
-        ]
-        cls.sorted_disease_result = pheval_disease_result
+        cls.gene_result_ranker = ResultRanker(
+            [
+                PhEvalGeneResult(
+                    gene_symbol="MAP3K14", gene_identifier="ENSG00000006062", score=0.9234
+                ),
+                PhEvalGeneResult(
+                    gene_symbol="A4GNT", gene_identifier="ENSG00000118017", score=0.6529
+                ),
+                PhEvalGeneResult(
+                    gene_symbol="OR14J1", gene_identifier="ENSG00000204695", score=0.6529
+                ),
+                PhEvalGeneResult(
+                    gene_symbol="PAGE1", gene_identifier="ENSG00000068985", score=0.5235
+                ),
+            ],
+            SortOrder.DESCENDING,
+        )
+        cls.variant_result_ranker = ResultRanker(
+            [
+                PhEvalVariantResult(
+                    chromosome="X", start=93473023, end=93473024, ref="A", alt="G", score=0.1245
+                ),
+                PhEvalVariantResult(
+                    chromosome="8", start=532356, end=532357, ref="A", alt="C", score=0.4578
+                ),
+                PhEvalVariantResult(
+                    chromosome="5",
+                    start=23457444233,
+                    end=23457444234,
+                    ref="A",
+                    alt="C",
+                    score=0.9999,
+                ),
+                PhEvalVariantResult(
+                    chromosome="12", start=12754332, end=12754333, ref="T", alt="G", score=0.9999
+                ),
+            ],
+            sort_order=SortOrder.ASCENDING,
+        )
+        cls.variant_result_ranker_grouping_id = ResultRanker(
+            [
+                PhEvalVariantResult(
+                    chromosome="X",
+                    start=93473023,
+                    end=93473024,
+                    ref="A",
+                    alt="G",
+                    score=0.1245,
+                    grouping_id="4567",
+                ),
+                PhEvalVariantResult(
+                    chromosome="8",
+                    start=532356,
+                    end=532357,
+                    ref="A",
+                    alt="C",
+                    score=0.4578,
+                    grouping_id="789",
+                ),
+                PhEvalVariantResult(
+                    chromosome="5",
+                    start=23457444233,
+                    end=23457444234,
+                    ref="A",
+                    alt="C",
+                    score=0.9999,
+                    grouping_id="12345",
+                ),
+                PhEvalVariantResult(
+                    chromosome="12",
+                    start=12754332,
+                    end=12754333,
+                    ref="T",
+                    alt="G",
+                    score=0.9999,
+                    grouping_id="12345",
+                ),
+            ],
+            sort_order=SortOrder.DESCENDING,
+        )
+        cls.disease_result_ranker = ResultRanker(
+            pheval_disease_result, sort_order=SortOrder.DESCENDING
+        )
+
+    def test__has_valid_grouping_id(self):
+        df = pd.DataFrame({"score": [0.5, 0.7, 0.3], "grouping_id": ["A", "B", "C"]})
+        self.assertTrue(self.variant_result_ranker._has_valid_grouping_id(df))
+
+    def test__has_valid_grouping_id_present_with_none(self):
+        df = pd.DataFrame({"score": [0.5, 0.7, 0.3], "grouping_id": ["A", None, "C"]})
+        self.assertFalse(self.variant_result_ranker._has_valid_grouping_id(df))
+
+    def test__has_valid_grouping_id_not_present(self):
+        df = pd.DataFrame({"score": [0.5, 0.7, 0.3]})
+        self.assertFalse(self.variant_result_ranker._has_valid_grouping_id(df))
+
+    def test__rank_with_grouping_id(self):
+        df = pd.DataFrame(
+            {
+                "score": [0.9, 0.9, 0.8, 0.7, 0.7, 0.6, 0.5, 0.5, 0.4],
+                "grouping_id": ["A", "A", "B", "C", "D", "E", "F", "F", "G"],
+            }
+        )
+        self.assertTrue(
+            self.gene_result_ranker._rank_with_grouping_id(df).equals(
+                pd.DataFrame(
+                    {
+                        "score": [0.9, 0.9, 0.8, 0.7, 0.7, 0.6, 0.5, 0.5, 0.4],
+                        "grouping_id": ["A", "A", "B", "C", "D", "E", "F", "F", "G"],
+                        "min_rank": [1, 1, 2, 4, 3, 5, 6, 6, 7],
+                        "rank": [1, 1, 2, 4, 4, 5, 6, 6, 7],
+                    }
+                )
+            )
+        )
+
+    def test__rank_without_grouping_id(self):
+        df = pd.DataFrame(
+            {
+                "score": [0.9, 0.9, 0.8, 0.7, 0.7, 0.6, 0.5, 0.5, 0.4],
+            }
+        )
+        self.assertTrue(
+            self.gene_result_ranker._rank_without_grouping_id(df).equals(
+                pd.DataFrame(
+                    {
+                        "score": [0.9, 0.9, 0.8, 0.7, 0.7, 0.6, 0.5, 0.5, 0.4],
+                        "rank": [2, 2, 3, 5, 5, 6, 8, 8, 9],
+                    }
+                )
+            )
+        )
 
     def test_rank_pheval_results_gene(self):
         self.assertTrue(
-            _rank_pheval_result(self.sorted_gene_result, SortOrder.DESCENDING).equals(
+            self.gene_result_ranker.rank().equals(
                 pd.DataFrame(
                     {
                         "gene_symbol": ["MAP3K14", "A4GNT", "OR14J1", "PAGE1"],
@@ -262,32 +366,49 @@ def test_rank_pheval_results_gene(self):
                             "ENSG00000068985",
                         ],
                         "score": [0.9234, 0.6529, 0.6529, 0.5235],
-                        "rank": [1.0, 3.0, 3.0, 4.0],
+                        "rank": [1, 3, 3, 4],
                     }
                 )
             )
         )
 
     def test_rank_pheval_results_variant(self):
         self.assertTrue(
-            _rank_pheval_result(self.sorted_variant_result, SortOrder.ASCENDING).equals(
+            self.variant_result_ranker.rank().equals(
                 pd.DataFrame(
                     {
                         "chromosome": ["X", "8", "5", "12"],
                         "start": [93473023, 532356, 23457444233, 12754332],
                         "end": [93473024, 532357, 23457444234, 12754333],
                         "ref": ["A", "A", "A", "T"],
                         "alt": ["G", "C", "C", "G"],
-                        "score": [0.1245, 0.4578, 0.9348, 0.9999],
-                        "rank": [1.0, 2.0, 3.0, 4.0],
+                        "score": [0.1245, 0.4578, 0.9999, 0.9999],
+                        "rank": [1, 2, 4, 4],
+                    }
+                )
+            )
+        )
+
+    def test_rank_pheval_results_variant_grouping_id(self):
+        self.assertTrue(
+            self.variant_result_ranker_grouping_id.rank().equals(
+                pd.DataFrame(
+                    {
+                        "chromosome": ["X", "8", "5", "12"],
+                        "start": [93473023, 532356, 23457444233, 12754332],
+                        "end": [93473024, 532357, 23457444234, 12754333],
+                        "ref": ["A", "A", "A", "T"],
+                        "alt": ["G", "C", "C", "G"],
+                        "score": [0.1245, 0.4578, 0.9999, 0.9999],
+                        "rank": [3, 2, 1, 1],
                     }
                 )
             )
         )
 
     def test_rank_pheval_results_disease(self):
         self.assertTrue(
-            _rank_pheval_result(self.sorted_disease_result, SortOrder.DESCENDING).equals(
+            self.disease_result_ranker.rank().equals(
                 pd.DataFrame(
                     {
                         "disease_name": {
@@ -301,7 +422,7 @@ def test_rank_pheval_results_disease(self):
                             2: "OMIM:614483",
                         },
                         "score": {0: 4.284, 1: 4.284, 2: -1.871},
-                        "rank": {0: 2.0, 1: 2.0, 2: 3.0},
+                        "rank": {0: 2, 1: 2, 2: 3},
                     }
                 )
             )
@@ -322,7 +443,7 @@ def test_create_pheval_result_gene(self):
                             "ENSG00000068985",
                         ],
                         "score": [0.9234, 0.6529, 0.6529, 0.5235],
-                        "rank": [1.0, 3.0, 3.0, 4.0],
+                        "rank": [1, 3, 3, 4],
                     }
                 )
             )
@@ -339,7 +460,7 @@ def test_create_pheval_result_variant(self):
                         "ref": ["A", "A", "A", "T"],
                         "alt": ["G", "C", "C", "G"],
                         "score": [0.1245, 0.4578, 0.9348, 0.9999],
-                        "rank": [1.0, 2.0, 3.0, 4.0],
+                        "rank": [1, 2, 3, 4],
                     }
                 )
             )
@@ -361,7 +482,7 @@ def test_create_pheval_result_disease(self):
                             2: "OMIM:614483",
                         },
                         "score": {0: 4.284, 1: 4.284, 2: -1.871},
-                        "rank": {0: 2.0, 1: 2.0, 2: 3.0},
+                        "rank": {0: 2, 1: 2, 2: 3},
                     }
                 )
             )