documentation

simonsobs · Dec 6, 2023 · ce0374d · ce0374d
1 parent a944ac5
commit ce0374d
Show file tree

Hide file tree

Showing 2 changed files with 147 additions and 13 deletions.
diff --git a/project/data_analysis/python/null_tests/custom_nulls/correlated_pte_subsamples.ipynb b/project/data_analysis/python/null_tests/custom_nulls/correlated_pte_subsamples.ipynb
@@ -9,6 +9,13 @@
     "Connected to cmb-della8 (Python 3.8.3)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We start by evaluating a KS test for the raw PTE distributions. We expect that by not accounting for correlations induced by the linear dependence of null tests, these KS test may deliver low PTE values."
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 9,
@@ -23,6 +30,27 @@
     "from random import sample, seed\n",
     "\n",
     "def get_null_list(surveys, arrays, spectra):\n",
+    "    \"\"\"For a given set of surveys, arrays, and polarization cross spectra,\n",
+    "    form all the possible null tests. A null test is a combination of a \n",
+    "    pol cross spectra identifier, the two maps labeling the first leg of \n",
+    "    the null test, and the two maps labeling the second leg of the null test.\n",
+    "\n",
+    "    Parameters\n",
+    "    ----------\n",
+    "    surveys : list\n",
+    "        All the surveys.\n",
+    "    arrays : list\n",
+    "        All the arrays for each survey\n",
+    "    spectra : list\n",
+    "        All the pol cross spectra for each array of each survey.\n",
+    "\n",
+    "    Returns\n",
+    "    -------\n",
+    "    list of list\n",
+    "        All the possible null tests, e.g one of which may be:\n",
+    "        \n",
+    "        ['TE', ('pa4_f220_el1', 'pa4_f220_el1'), ('pa4_f220_el1', 'pa4_f220_el2')]\n",
+    "    \"\"\"\n",
     "    map_set_list = []\n",
     "    for sv in surveys:\n",
     "        for ar in arrays:\n",
@@ -37,6 +65,18 @@
     "    return null_list\n",
     "\n",
     "def pte_histo(pte_list, null_test, n_bins):\n",
+    "    \"\"\"Plot the PTE histogram for null test PTEs from a list of\n",
+    "    PTEs. \n",
+    "\n",
+    "    Parameters\n",
+    "    ----------\n",
+    "    pte_list : list\n",
+    "        PTE values of all the null tests.\n",
+    "    null_test : str\n",
+    "        Description of the null tests (will be plot title).\n",
+    "    n_bins : int\n",
+    "        Number of bins in the histogram.\n",
+    "    \"\"\"\n",
     "    n_samples = len(pte_list)\n",
     "\n",
     "    bins = np.linspace(0, 1, n_bins + 1)\n",
@@ -79,6 +119,10 @@
     }
    ],
    "source": [
+    "# First let's get the \"naive\" null test results: this is where we take all the raw results,\n",
+    "# and get their joint histogram. We will evaluate a single KS test on this single set of PTE\n",
+    "# values.\n",
+    "\n",
     "null_test = \"pwv\"\n",
     "spectra = [\"TT\", \"TE\", \"TB\", \"ET\", \"BT\", \"EE\", \"EB\", \"BE\", \"BB\"]\n",
     "\n",
@@ -102,8 +146,12 @@
     "    n_bins = 20\n",
     "\n",
     "pte_list = []\n",
+    "\n",
+    "# We look at arrays individually\n",
     "for t_ar in test_arrays:\n",
     "    null_list = get_null_list(surveys, t_ar, spectra)\n",
+    "\n",
+    "    # For a given null, we evaluate whether to include it\n",
     "    for null in null_list:\n",
     "        mode, (ms1, ms2), (ms3, ms4) = null\n",
     "\n",
@@ -112,13 +160,17 @@
     "        else:\n",
     "            my_ar = t_ar[0][:8]\n",
     "\n",
+    "        # Skip if exactly equal to another null test\n",
     "        if (ms1 == ms2) & (ms3 == ms4) & (mode in [\"ET\", \"BT\", \"BE\"]) :\n",
     "            # print(f\"skip {ms1}x{ms2}-{ms3}x{ms4} {mode} since it's a doublon of {mode[::-1]}\")\n",
     "            continue\n",
+    "\n",
+    "        # Skip pa4 pol\n",
     "        if (my_ar == \"pa4_f220\" ) & (mode != \"TT\"):\n",
     "            continue\n",
     "\n",
-    "        res_dict = pickle.load(open(f\"/scratch/gpfs/zatkins/data/simonsobs/PSpipe/project/dr6v4_unblinding/nulls/{null_test}/null_test_{my_ar}/diff_{mode}_{ms1}x{ms2}_{ms3}x{ms4}.pkl\", \"rb\"))\n",
+    "        # Otherwise, grab its PTE and plot PTEs of all the valid nulls\n",
+    "        res_dict = pickle.load(open(f\"/global/cfs/cdirs/act/data/tlouis/dr6v4/nulls/{null_test}/null_test_{my_ar}/diff_{mode}_{ms1}x{ms2}_{ms3}x{ms4}.pkl\", \"rb\"))\n",
     "\n",
     "        name = res_dict[\"fname\"]\n",
     "        chi2 = res_dict[\"chi2\"]\n",
@@ -148,6 +200,8 @@
     }
    ],
    "source": [
+    "# Repeat above but for inout\n",
+    "\n",
     "null_test = \"inout\"\n",
     "spectra = [\"TT\", \"TE\", \"TB\", \"ET\", \"BT\", \"EE\", \"EB\", \"BE\", \"BB\"]\n",
     "\n",
@@ -217,6 +271,8 @@
     }
    ],
    "source": [
+    "# Repeat above but for elevation\n",
+    "\n",
     "null_test = \"elevation\"\n",
     "spectra = [\"TT\", \"TE\", \"TB\", \"ET\", \"BT\", \"EE\", \"EB\", \"BE\", \"BB\"]\n",
     "\n",
@@ -267,6 +323,15 @@
     "pte_histo(pte_list, null_test, n_bins)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Next, we repeat the above test, but rather than including all null tests in a single PTE distribution, we break up the null tests into separate sets linearly independent subsamples, and evaluate the PTE distribution of each subsample. This is motivated by the following observation (for example):\n",
+    "\n",
+    "Given 2 PWV splits, we can form 3 cross-pwv spectra: pwv1pwv1, pwv1pwv2, and pwv2pwv2. From 3 cross-pwv spectra, we can form 3 total null tests: pwv1pwv1 - pwv1pwv2, pwv1pwv2 - pwv2pwv2, pwv2pwv2 - pwv1pwv1. But when we include all 3 null tests, we have repeated information, since given any 2 of these null tests, the 3rd is a simple sum of the other 2. Thus, we can ask what happens when we only pick 2 null tests, which is the smallest number that contain the full information in the maps. There are 3 ways to do form these linearly independent subsamples, so we rerun the PTE distribution and KS test for each subsample."
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 1,
@@ -281,6 +346,10 @@
     "from random import sample, seed\n",
     "\n",
     "def get_null_list(surveys, arrays, spectra):\n",
+    "    \"\"\"Like the function before, but this time also return a list\n",
+    "    of the possible cross-spectra that will be differenced to form\n",
+    "    null tests.\n",
+    "    \"\"\"\n",
     "    map_set_list = []\n",
     "    for sv in surveys:\n",
     "        for ar in arrays:\n",
@@ -297,6 +366,21 @@
     "    return null_list, crosspecs\n",
     "\n",
     "def get_best_null_sets(specs):\n",
+    "    \"\"\"Given a list of N cross-spectra, form all the subsamples of N-1\n",
+    "    pairs, i.e. null tests. In particular, return N-1 pairs such that\n",
+    "    no leg of the pairs is repeated more than twice.\n",
+    "\n",
+    "    Parameters\n",
+    "    ----------\n",
+    "    specs : list\n",
+    "        List of spectra identifiers.\n",
+    "\n",
+    "    Returns\n",
+    "    -------\n",
+    "    List of two-tuples\n",
+    "        Each tuple contains two elements. Each element is a \n",
+    "        pair of spectra, i.e. a null test.\n",
+    "    \"\"\"\n",
     "    perms = list(permutations(specs, len(specs)))\n",
     "\n",
     "    for s in perms:\n",
@@ -311,7 +395,10 @@
     "\n",
     "    return best_sets\n",
     "\n",
-    "def pte_histo(null_test, list_of_pte_lists, list_of_pte_titles, n_bins):    \n",
+    "def pte_histo(null_test, list_of_pte_lists, list_of_pte_titles, n_bins):\n",
+    "    \"\"\"Like before, but now we will make a plot for each subsample of null tests.\n",
+    "    So, list_of_pte_lists contains the pte_list for each subsample, and list_of_pte_titles\n",
+    "    contains the title for each subsample plot.\"\"\" \n",
     "    ks_test_list = []\n",
     "    for i, pte_list in enumerate(list_of_pte_lists):\n",
     "        plt.figure(figsize=(8,6))\n",
@@ -423,14 +510,19 @@
     }
    ],
    "source": [
+    "# Now let's get the \"subsampled\" null test results: for pwv, there are 3 total \n",
+    "# subsamples of null tests that contain all the map information. Each subsample\n",
+    "# contains 2 out of the 3 possible null tests. We will just plot the PTE distributions\n",
+    "# for all 3 subsamples.\n",
+    "\n",
     "null_test = \"pwv\"\n",
     "spectra = [\"TT\", \"TE\", \"TB\", \"ET\", \"BT\", \"EE\", \"EB\", \"BE\", \"BB\"]\n",
     "\n",
     "if null_test == \"elevation\":\n",
     "    surveys = [\"el1\", \"el2\", \"el3\"]\n",
     "    test_arrays = [[\"pa4_f220\"], [\"pa5_f090\"], [\"pa5_f150\"],  [\"pa6_f090\"], [\"pa6_f150\"]]\n",
     "    n_bins = 60\n",
-    "    n_subsamp = 10\n",
+    "    n_subsamp = 100\n",
     "\n",
     "if null_test == \"pwv\":\n",
     "    surveys = [\"pwv1\", \"pwv2\"]\n",
@@ -448,18 +540,29 @@
     "    n_bins = 20\n",
     "    n_subsamp = 3\n",
     "\n",
+    "# important to initialize list_of_pte_lists this way so that each subsample pte_list\n",
+    "# has a different pointer!\n",
     "list_of_pte_lists = [[] for i in range(n_subsamp)]\n",
     "for t_ar in test_arrays:\n",
     "    null_list, crosspecs = get_null_list(surveys, t_ar, spectra)\n",
+    "\n",
+    "    # get the minimal subsamples that do not repeat a leg more than twice\n",
     "    best_crosspecs = get_best_null_sets(crosspecs)\n",
     "    \n",
-    "    seed(2) # same sample of best_crosspecs for each array\n",
+    "    # if there are a lot of possible subsamples, choose only a few of them\n",
+    "    seed(2) # same subsamples chosen for each array\n",
     "    crosspecs_samples = sample(best_crosspecs, n_subsamp)\n",
-    "    print(crosspecs_samples)\n",
+    "    print(crosspecs_samples) \n",
     "\n",
+    "    # do the following for each subsample of null tests\n",
     "    for i, crosspecs in enumerate(crosspecs_samples):\n",
+    "\n",
+    "        # check each null if we want to include it\n",
     "        for null in null_list:\n",
     "            mode, (ms1, ms2), (ms3, ms4) = null\n",
+    "\n",
+    "            # skip if the null is not in our subsample! note we also need to check the \n",
+    "            # \"backwards\" version of the null too\n",
     "            if [(ms1, ms2), (ms3, ms4)] not in crosspecs and [(ms3, ms4), (ms1, ms2)] not in crosspecs:\n",
     "                # print([(ms1, ms2), (ms3, ms4)], 'not in', crosspecs)\n",
     "                continue\n",
@@ -469,6 +572,7 @@
     "            else:\n",
     "                my_ar = t_ar[0][:8]\n",
     "\n",
+    "            # same checks as before\n",
     "            if (ms1 == ms2) & (ms3 == ms4) & (mode in [\"ET\", \"BT\", \"BE\"]) :\n",
     "                # print(f\"skip {ms1}x{ms2} - {ms3}x{ms4} {mode} since it's a doublon of {mode[::-1]}\")\n",
     "                continue\n",
@@ -552,14 +656,16 @@
     }
    ],
    "source": [
+    "# repeat for inout\n",
+    "\n",
     "null_test = \"inout\"\n",
     "spectra = [\"TT\", \"TE\", \"TB\", \"ET\", \"BT\", \"EE\", \"EB\", \"BE\", \"BB\"]\n",
     "\n",
     "if null_test == \"elevation\":\n",
     "    surveys = [\"el1\", \"el2\", \"el3\"]\n",
     "    test_arrays = [[\"pa4_f220\"], [\"pa5_f090\"], [\"pa5_f150\"],  [\"pa6_f090\"], [\"pa6_f150\"]]\n",
     "    n_bins = 60\n",
-    "    n_subsamp = 10\n",
+    "    n_subsamp = 100\n",
     "\n",
     "if null_test == \"pwv\":\n",
     "    surveys = [\"pwv1\", \"pwv2\"]\n",
@@ -1845,6 +1951,9 @@
     }
    ],
    "source": [
+    "# repeat for elevation. note, unlike pwv and inout which had only 3 possible subsamples,\n",
+    "# elevation has a very large number. thus the \"n_subsamp\" variable becomes more important!\n",
+    "\n",
     "null_test = \"elevation\"\n",
     "spectra = [\"TT\", \"TE\", \"TB\", \"ET\", \"BT\", \"EE\", \"EB\", \"BE\", \"BB\"]\n",
     "\n",

diff --git a/project/data_analysis/python/null_tests/custom_nulls/toy_model_ks.py b/project/data_analysis/python/null_tests/custom_nulls/toy_model_ks.py
@@ -2,29 +2,54 @@
 from scipy import stats
 from scipy.interpolate import interp1d
 
-nbin = 39 
+# Our simulation setup is as follows. A given simulation involves 99 null tests,
+# matching the number of pwv and inout null tests. This 99 is assumed to be
+# composed of 33 independent test sets (a test set could be a separate array
+# or polarization cross), each of which is composed of 3 pwv cross differences:
+# pwv1pwv1 - pwv1pwv2, pwv1pwv2 - pwv2pwv2, pwv1pwv1 - pwv2pwv2. Each "spectrum"
+# has 39 "ell bins.""
 nsim = 20_000 
 ntest = 33  
+nbin = 39 
+
 sim_ptes = np.zeros((nsim, ntest, 3))
+
 for sidx in range(nsim): 
-    if sidx % 100 == 0: print(sidx)
+    if sidx % 100 == 0:
+        print(sidx)
+
     for tidx in range(ntest): 
-        p11 = np.random.randn(nbin)  
-        p22 = np.random.randn(nbin)  
-        p12 = np.random.randn(nbin) 
+
+        # simulate the 3 pwv cross spectra as Gaussian r.v's
+        p11 = np.random.randn(nbin) 
+        p22 = np.random.randn(nbin) 
+        p12 = np.random.randn(nbin)
+
+        # form the 3 null test residuals. this step introduces correlations
+        # between null tests
         res_a = p11 - p22 
         res_b = p12 - p22 
         res_c = p11 - p12 
+
+        # get the chi2 of each null test, assuming diagonal over null tests
+        # and ell bins. The variance of a null test residual in a given ell 
+        # bin is 2.
         chi2_a = np.sum(res_a ** 2 / 2) 
         chi2_b = np.sum(res_b ** 2 / 2) 
         chi2_c = np.sum(res_c ** 2 / 2) 
+
+        # Get the ptes of the 3 null tests
         pte_a = 1 - stats.chi2.cdf(chi2_a, nbin)  
         pte_b = 1 - stats.chi2.cdf(chi2_b, nbin) 
         pte_c = 1 - stats.chi2.cdf(chi2_c, nbin) 
+
+        # For each sim, we have 99 null tests; they are correlated along
+        # the last axis which holds the pwv residuals
         sim_ptes[sidx,tidx,0] = pte_a 
         sim_ptes[sidx,tidx,1] = pte_b  
         sim_ptes[sidx,tidx,2] = pte_c
 
+# Evaluate the kstest of the 99 null tests for each of our sims
 ks_tests = np.zeros(nsim) 
 for sidx in range(nsim):
     ks_tests[sidx] = stats.kstest(
@@ -38,5 +63,5 @@
 ks_test_pwv = 0.018
 ks_test_inout = 0.039
 
-print(1 - cdf_interp(ks_test_pwv))
-print(1 - cdf_interp(ks_test_pwv))
+print(cdf_interp(ks_test_pwv))
+print(cdf_interp(ks_test_pwv))