Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

363 account for compound het variants #368

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/developing_a_pheval_plugin.md
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,7 @@ The dataclasses representing essential information extracted from tool-specific
show_source: true
---

For variant prioritisation results, the `grouping_id` parameter is designed to handle compound heterozygous variants in ranking. Compound heterozygosity occurs when two or more variants, inherited together, contribute to a phenotype. Variants that are part of the same compound heterozygous group (e.g., within the same gene) should be assigned the same `grouping_id`, so that they are ranked as a single entity, preserving their combined significance. Variants that are not part of any compound heterozygous group should each have a unique `grouping_id`; this prevents unintended overlap in ranking and ensures that each group or individual variant is accurately represented. Grouping is applied only when every result has a `grouping_id`: if any result leaves it unset (`None`), all results are ranked by score alone.
::: src.pheval.post_processing.post_processing.PhEvalVariantResult
handler: python
options:
Expand Down
72 changes: 52 additions & 20 deletions src/pheval/post_processing/post_processing.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import logging
import operator
from dataclasses import dataclass
from dataclasses import dataclass, field
from enum import Enum
from pathlib import Path
from typing import List, Union
Expand Down Expand Up @@ -96,6 +96,7 @@ class PhEvalVariantResult(PhEvalResult):
ref: str
alt: str
score: float
grouping_id: str = field(default=None)


@dataclass
Expand All @@ -105,7 +106,7 @@ class RankedPhEvalVariantResult(PhEvalVariantResult):
rank (int): The rank for the result entry
"""

rank: int
rank: int = 0

@staticmethod
def from_variant_result(pheval_variant_result: PhEvalVariantResult, rank: int):
Expand Down Expand Up @@ -228,26 +229,57 @@ def sort_pheval_results(self) -> [PhEvalResult]:
)


def _rank_pheval_result(pheval_result: [PhEvalResult], sort_order: SortOrder) -> pd.DataFrame:
"""
Rank PhEval results post-processed from tool-specific output, managing tied scores (ex aequo)
class ResultRanker:
def __init__(self, pheval_result: List[PhEvalResult], sort_order: SortOrder):
"""
Initialise the ResultRanker.
Args:
pheval_result (List[PhEvalResult]): PhEval results to rank.
sort_order (SortOrder): Sorting order based on which ranking is performed.
"""
self.pheval_result = pheval_result
self.sort_order = sort_order
self.ascending = sort_order == SortOrder.ASCENDING

Args:
pheval_result ([PhEvalResult]): PhEval results obtained from tool-specific output
sort_order (SortOrder): Sorting order based on which ranking is performed
def rank(self) -> pd.DataFrame:
"""
Rank PhEval results, managing tied scores (ex aequo) and handling grouping_id if present.

Returns:
pd.DataFrame : Ranked PhEval results with tied scores managed
Returns:
pd.DataFrame : Ranked PhEval results with tied scores managed.
"""
pheval_result_df = pd.DataFrame([data.__dict__ for data in self.pheval_result])

Raises:
ValueError: If an incompatible PhEval result type is encountered
"""
pheval_result_df = pd.DataFrame([data.__dict__ for data in pheval_result])
if sort_order == SortOrder.ASCENDING:
pheval_result_df["rank"] = pheval_result_df["score"].rank(method="max", ascending=True)
elif sort_order == SortOrder.DESCENDING:
pheval_result_df["rank"] = pheval_result_df["score"].rank(method="max", ascending=False)
return pheval_result_df
if self._has_valid_grouping_id(pheval_result_df):
pheval_result_df = self._rank_with_grouping_id(pheval_result_df)
else:
pheval_result_df = self._rank_without_grouping_id(pheval_result_df)
return pheval_result_df.drop(columns=["min_rank", "grouping_id"], errors="ignore")

@staticmethod
def _has_valid_grouping_id(pheval_result_df: pd.DataFrame) -> bool:
"""Check if grouping_id exists and has no None values."""
return (
"grouping_id" in pheval_result_df.columns
and not pheval_result_df["grouping_id"].isnull().any()
)

def _rank_with_grouping_id(self, pheval_result_df: pd.DataFrame) -> pd.DataFrame:
"""Apply ranking when grouping_id is present and has no None values."""
pheval_result_df["min_rank"] = (
pheval_result_df.groupby(["score", "grouping_id"])
.ngroup()
.rank(method="dense", ascending=self.ascending)
).astype(int)
pheval_result_df["rank"] = pheval_result_df.groupby("score")["min_rank"].transform("max")
return pheval_result_df

def _rank_without_grouping_id(self, pheval_result_df: pd.DataFrame) -> pd.DataFrame:
"""Apply ranking without using grouping_id."""
pheval_result_df["rank"] = (
pheval_result_df["score"].rank(method="max", ascending=self.ascending).astype(int)
)
return pheval_result_df


def _return_sort_order(sort_order_str: str) -> SortOrder:
Expand Down Expand Up @@ -282,7 +314,7 @@ def _create_pheval_result(pheval_result: [PhEvalResult], sort_order_str: str) ->
"""
sort_order = _return_sort_order(sort_order_str)
sorted_pheval_result = ResultSorter(pheval_result, sort_order).sort_pheval_results()
return _rank_pheval_result(sorted_pheval_result, sort_order)
return ResultRanker(sorted_pheval_result, sort_order).rank()


def _write_pheval_gene_result(
Expand Down
201 changes: 161 additions & 40 deletions tests/test_post_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,10 @@
RankedPhEvalDiseaseResult,
RankedPhEvalGeneResult,
RankedPhEvalVariantResult,
ResultRanker,
ResultSorter,
SortOrder,
_create_pheval_result,
_rank_pheval_result,
_return_sort_order,
calculate_end_pos,
)
Expand Down Expand Up @@ -217,41 +217,145 @@ def test_sort_pheval_results_pvalue(self):
)


class TestRankPhEvalResults(unittest.TestCase):
class TestResultRanker(unittest.TestCase):
@classmethod
def setUpClass(cls) -> None:
cls.sorted_gene_result = [
PhEvalGeneResult(
gene_symbol="MAP3K14", gene_identifier="ENSG00000006062", score=0.9234
),
PhEvalGeneResult(gene_symbol="A4GNT", gene_identifier="ENSG00000118017", score=0.6529),
PhEvalGeneResult(gene_symbol="OR14J1", gene_identifier="ENSG00000204695", score=0.6529),
PhEvalGeneResult(gene_symbol="PAGE1", gene_identifier="ENSG00000068985", score=0.5235),
]
cls.sorted_variant_result = [
PhEvalVariantResult(
chromosome="X", start=93473023, end=93473024, ref="A", alt="G", score=0.1245
),
PhEvalVariantResult(
chromosome="8", start=532356, end=532357, ref="A", alt="C", score=0.4578
),
PhEvalVariantResult(
chromosome="5",
start=23457444233,
end=23457444234,
ref="A",
alt="C",
score=0.9348,
),
PhEvalVariantResult(
chromosome="12", start=12754332, end=12754333, ref="T", alt="G", score=0.9999
),
]
cls.sorted_disease_result = pheval_disease_result
cls.gene_result_ranker = ResultRanker(
[
PhEvalGeneResult(
gene_symbol="MAP3K14", gene_identifier="ENSG00000006062", score=0.9234
),
PhEvalGeneResult(
gene_symbol="A4GNT", gene_identifier="ENSG00000118017", score=0.6529
),
PhEvalGeneResult(
gene_symbol="OR14J1", gene_identifier="ENSG00000204695", score=0.6529
),
PhEvalGeneResult(
gene_symbol="PAGE1", gene_identifier="ENSG00000068985", score=0.5235
),
],
SortOrder.DESCENDING,
)
cls.variant_result_ranker = ResultRanker(
[
PhEvalVariantResult(
chromosome="X", start=93473023, end=93473024, ref="A", alt="G", score=0.1245
),
PhEvalVariantResult(
chromosome="8", start=532356, end=532357, ref="A", alt="C", score=0.4578
),
PhEvalVariantResult(
chromosome="5",
start=23457444233,
end=23457444234,
ref="A",
alt="C",
score=0.9999,
),
PhEvalVariantResult(
chromosome="12", start=12754332, end=12754333, ref="T", alt="G", score=0.9999
),
],
sort_order=SortOrder.ASCENDING,
)
cls.variant_result_ranker_grouping_id = ResultRanker(
[
PhEvalVariantResult(
chromosome="X",
start=93473023,
end=93473024,
ref="A",
alt="G",
score=0.1245,
grouping_id="4567",
),
PhEvalVariantResult(
chromosome="8",
start=532356,
end=532357,
ref="A",
alt="C",
score=0.4578,
grouping_id="789",
),
PhEvalVariantResult(
chromosome="5",
start=23457444233,
end=23457444234,
ref="A",
alt="C",
score=0.9999,
grouping_id="12345",
),
PhEvalVariantResult(
chromosome="12",
start=12754332,
end=12754333,
ref="T",
alt="G",
score=0.9999,
grouping_id="12345",
),
],
sort_order=SortOrder.DESCENDING,
)
cls.disease_result_ranker = ResultRanker(
pheval_disease_result, sort_order=SortOrder.DESCENDING
)

def test__has_valid_grouping_id(self):
df = pd.DataFrame({"score": [0.5, 0.7, 0.3], "grouping_id": ["A", "B", "C"]})
self.assertTrue(self.variant_result_ranker._has_valid_grouping_id(df))

def test__has_valid_grouping_id_present_with_none(self):
df = pd.DataFrame({"score": [0.5, 0.7, 0.3], "grouping_id": ["A", None, "C"]})
self.assertFalse(self.variant_result_ranker._has_valid_grouping_id(df))

def test__has_valid_grouping_id_not_present(self):
df = pd.DataFrame({"score": [0.5, 0.7, 0.3]})
self.assertFalse(self.variant_result_ranker._has_valid_grouping_id(df))

def test__rank_with_grouping_id(self):
df = pd.DataFrame(
{
"score": [0.9, 0.9, 0.8, 0.7, 0.7, 0.6, 0.5, 0.5, 0.4],
"grouping_id": ["A", "A", "B", "C", "D", "E", "F", "F", "G"],
}
)
self.assertTrue(
self.gene_result_ranker._rank_with_grouping_id(df).equals(
pd.DataFrame(
{
"score": [0.9, 0.9, 0.8, 0.7, 0.7, 0.6, 0.5, 0.5, 0.4],
"grouping_id": ["A", "A", "B", "C", "D", "E", "F", "F", "G"],
"min_rank": [1, 1, 2, 4, 3, 5, 6, 6, 7],
"rank": [1, 1, 2, 4, 4, 5, 6, 6, 7],
}
)
)
)

def test__rank_without_grouping_id(self):
df = pd.DataFrame(
{
"score": [0.9, 0.9, 0.8, 0.7, 0.7, 0.6, 0.5, 0.5, 0.4],
}
)
self.assertTrue(
self.gene_result_ranker._rank_without_grouping_id(df).equals(
pd.DataFrame(
{
"score": [0.9, 0.9, 0.8, 0.7, 0.7, 0.6, 0.5, 0.5, 0.4],
"rank": [2, 2, 3, 5, 5, 6, 8, 8, 9],
}
)
)
)

def test_rank_pheval_results_gene(self):
self.assertTrue(
_rank_pheval_result(self.sorted_gene_result, SortOrder.DESCENDING).equals(
self.gene_result_ranker.rank().equals(
pd.DataFrame(
{
"gene_symbol": ["MAP3K14", "A4GNT", "OR14J1", "PAGE1"],
Expand All @@ -262,32 +366,49 @@ def test_rank_pheval_results_gene(self):
"ENSG00000068985",
],
"score": [0.9234, 0.6529, 0.6529, 0.5235],
"rank": [1.0, 3.0, 3.0, 4.0],
"rank": [1, 3, 3, 4],
}
)
)
)

def test_rank_pheval_results_variant(self):
self.assertTrue(
_rank_pheval_result(self.sorted_variant_result, SortOrder.ASCENDING).equals(
self.variant_result_ranker.rank().equals(
pd.DataFrame(
{
"chromosome": ["X", "8", "5", "12"],
"start": [93473023, 532356, 23457444233, 12754332],
"end": [93473024, 532357, 23457444234, 12754333],
"ref": ["A", "A", "A", "T"],
"alt": ["G", "C", "C", "G"],
"score": [0.1245, 0.4578, 0.9348, 0.9999],
"rank": [1.0, 2.0, 3.0, 4.0],
"score": [0.1245, 0.4578, 0.9999, 0.9999],
"rank": [1, 2, 4, 4],
}
)
)
)

def test_rank_pheval_results_variant_grouping_id(self):
self.assertTrue(
self.variant_result_ranker_grouping_id.rank().equals(
pd.DataFrame(
{
"chromosome": ["X", "8", "5", "12"],
"start": [93473023, 532356, 23457444233, 12754332],
"end": [93473024, 532357, 23457444234, 12754333],
"ref": ["A", "A", "A", "T"],
"alt": ["G", "C", "C", "G"],
"score": [0.1245, 0.4578, 0.9999, 0.9999],
"rank": [3, 2, 1, 1],
}
)
)
)

def test_rank_pheval_results_disease(self):
self.assertTrue(
_rank_pheval_result(self.sorted_disease_result, SortOrder.DESCENDING).equals(
self.disease_result_ranker.rank().equals(
pd.DataFrame(
{
"disease_name": {
Expand All @@ -301,7 +422,7 @@ def test_rank_pheval_results_disease(self):
2: "OMIM:614483",
},
"score": {0: 4.284, 1: 4.284, 2: -1.871},
"rank": {0: 2.0, 1: 2.0, 2: 3.0},
"rank": {0: 2, 1: 2, 2: 3},
}
)
)
Expand All @@ -322,7 +443,7 @@ def test_create_pheval_result_gene(self):
"ENSG00000068985",
],
"score": [0.9234, 0.6529, 0.6529, 0.5235],
"rank": [1.0, 3.0, 3.0, 4.0],
"rank": [1, 3, 3, 4],
}
)
)
Expand All @@ -339,7 +460,7 @@ def test_create_pheval_result_variant(self):
"ref": ["A", "A", "A", "T"],
"alt": ["G", "C", "C", "G"],
"score": [0.1245, 0.4578, 0.9348, 0.9999],
"rank": [1.0, 2.0, 3.0, 4.0],
"rank": [1, 2, 3, 4],
}
)
)
Expand All @@ -361,7 +482,7 @@ def test_create_pheval_result_disease(self):
2: "OMIM:614483",
},
"score": {0: 4.284, 1: 4.284, 2: -1.871},
"rank": {0: 2.0, 1: 2.0, 2: 3.0},
"rank": {0: 2, 1: 2, 2: 3},
}
)
)
Expand Down
Loading