fix adj. p-value calculation

anders-biostat · Oct 4, 2024 · 730aa7c · 730aa7c
1 parent 5bbee00
commit 730aa7c
Show file tree

Hide file tree

Showing 2 changed files with 133 additions and 3 deletions.
diff --git a/methscan/diff.py b/methscan/diff.py
@@ -62,6 +62,15 @@ def calc_fdr(bools):
             # prevent div by 0 in the rare case that the best hit is from a permutation
             adj_p_val = 1
         adj_p_vals[i] = adj_p_val
+    # iterate through the list from the back to make sure that there are no values
+    # greater than one, and to make sure it's continuously increasing
+    current_min = 1.0
+    for i_rev in np.arange(1, len(bools))[::-1]:
+        p = adj_p_vals[i_rev]
+        if p > current_min:
+            adj_p_vals[i_rev] = current_min
+        elif p < current_min:
+            current_min = p
     return adj_p_vals
 
 
@@ -117,7 +126,7 @@ def calc_welch_tstat(group1, group2, min_cells):
     Calculates the t-statistic according to Welch's t-test for unequal variances.
     Mostly copy-paste from calc_welch_tstat_df, this is ugly but gives a slight edge
     in performance.
-    Returns the t-statistic, df, and the two group sizes.
+    Returns the t-statistic only.
     """
     len_g1 = len(group1)
     if len_g1 < min_cells:

diff --git a/tests/test_diff.py b/tests/test_diff.py
@@ -5,7 +5,7 @@
 from click.testing import CliRunner
 
 from methscan.cli import cli
-from methscan.diff import parse_cell_groups
+from methscan.diff import parse_cell_groups, calc_fdr
 
 
 def test_parse_cell_groups():
@@ -52,7 +52,7 @@ def test_diff_cli(tmp_path):
     assert dmr[1].sum() == 31250980767
     assert dmr[2].sum() == 31252060767
     assert dmr[3].sum() == -165.62208458348474
-    assert dmr[11].sum() == 486.24726241344274
+    assert dmr[11].sum() == 453.1849885684313
     assert len(dmr[dmr[9] == "neuroblast"]) == 284
     assert dmr[0][493] == 9
     assert dmr[2][135] == 58035292
@@ -63,3 +63,124 @@ def test_diff_cli(tmp_path):
     assert np.all(dmr[6] >= 3)
     assert np.all(dmr[7] >= 0) and np.all(dmr[7] <= 1)
     assert np.all(dmr[8] >= 0) and np.all(dmr[8] <= 1)
+
+
+FDR_TEST_DATA = [
+    {
+        "input": [
+            True,
+            True,
+            False,
+        ],
+        "output": {"mean": 0.3333333, "<0.05": 2, "<0.1": 2, "<0.5": 2},
+    },
+    {
+        "input": [
+            False,
+            False,
+            True,
+        ],
+        "output": {"mean": 1, "<0.05": 0, "<0.1": 0, "<0.5": 0},
+    },
+    {
+        "input": [
+            True,
+            True,
+            True,
+            True,
+            True,
+            False,
+            True,
+            True,
+            True,
+            True,
+            True,
+            True,
+            True,
+            True,
+            True,
+            True,
+            True,
+            True,
+            False,
+            True,
+            False,
+            False,
+            False,
+            True,
+            True,
+            False,
+            True,
+            True,
+            False,
+            True,
+            True,
+            False,
+            False,
+            True,
+            True,
+            True,
+            False,
+            True,
+            False,
+            True,
+            False,
+            True,
+            False,
+            False,
+            True,
+            False,
+            False,
+            True,
+            True,
+            False,
+            True,
+            True,
+            True,
+            False,
+            False,
+            True,
+            False,
+            False,
+            True,
+            False,
+            True,
+            False,
+            True,
+            True,
+            True,
+            True,
+            False,
+            False,
+            False,
+            True,
+            False,
+            False,
+            False,
+            True,
+            True,
+            False,
+            True,
+            True,
+            False,
+            True,
+            False,
+            False,
+            False,
+            False,
+            True,
+        ],
+        "output": {"mean": 0.5335206, "<0.05": 5, "<0.1": 18, "<0.5": 36},
+    },
+]
+
+
+def test_calc_fdr():
+    for test_d in FDR_TEST_DATA:
+        input_bools = np.array(test_d["input"])
+        outputs = test_d["output"]
+        adj_p_vals = calc_fdr(input_bools)
+        assert np.sum(adj_p_vals < 0.05) == outputs["<0.05"], "n hits <0.05"
+        assert np.sum(adj_p_vals < 0.1) == outputs["<0.1"], "n hits <0.1"
+        assert np.sum(adj_p_vals < 0.5) == outputs["<0.5"], "n hits <0.5"
+        assert round(np.mean(adj_p_vals), 7) == outputs["mean"], "mean adj. p"