Skip to content

Commit

Permalink
Improve notebook to handle max_scaledown param and more accurate calcs
Browse files Browse the repository at this point in the history
To squash into 66d79b8.
  • Loading branch information
boydgreenfield committed Feb 11, 2023
1 parent 2b51253 commit 413506f
Showing 1 changed file with 75 additions and 18 deletions.
93 changes: 75 additions & 18 deletions docs/notebook/calculate-parameters.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -24,7 +24,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -95,7 +95,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -116,46 +116,103 @@
"bits_per_element = m_over_n * kappa\n",
"\n",
"# Next, we compute the implied indeterminacy error rate and the required number and size of secondary arrays\n",
"beta_without_correction = stats.binom.cdf(1, nu - kappa, p) - stats.binom.cdf(0, nu - kappa, p) # this is also the scaledown factor\n",
"n_secondaries = -1\n",
"uncorrected_beta = stats.binom.cdf(1, nu - kappa, p) - stats.binom.cdf(0, nu - kappa, p) # this is also the scaledown factor\n",
"n_secondaries = 0\n",
"calculated_indeterminacy_rate = np.inf\n",
"\n",
"# TODO: This can be refined to allow a maximum scaledown of the array (as we do in the implementation)\n",
"#\n",
"secondary_array_size = N_ELEMENTS\n",
"expected_indeterminate_results = int(N_ELEMENTS * uncorrected_beta)\n",
"debug = False\n",
"while calculated_indeterminacy_rate > MAX_INDETERMINACY_RATE:\n",
" # Stop if the expected number of indeterminate results is < 0.5 \n",
" if expected_indeterminate_results < 0.5:\n",
" break\n",
"\n",
" # Scale the secondary array down by the uncorrected 𝛽\n",
" n_secondaries += 1 \n",
" calculated_indeterminacy_rate = np.power(beta_without_correction, n_secondaries)\n",
" expected_indeterminate_results = N_ELEMENTS * calculated_indeterminacy_rate\n",
" if expected_indeterminate_results < 0.1:\n",
" calculated_indeterminacy_rate = 0.0"
" secondary_array_size = int(secondary_array_size * uncorrected_beta)\n",
" \n",
" # But never make an array smaller than N_ELEMENTS * MAX_SCALEDOWN\n",
" if secondary_array_size < N_ELEMENTS * MAX_SCALEDOWN:\n",
" secondary_array_size = int(N_ELEMENTS * MAX_SCALEDOWN)\n",
"\n",
" if debug:\n",
" print(f\"The #{n_secondaries} secondary array will be {secondary_array_size:,} elements ({int(expected_indeterminate_results):,} expected elements)\")\n",
" \n",
" # Now calculate the expected number of indeterminate results flowing *out* of the nth secondary array\n",
" secondary_array_size_bits = secondary_array_size * bits_per_element\n",
" corrected_m_over_n = (secondary_array_size / expected_indeterminate_results) * m_over_n\n",
" corrected_p = calculate_fp_rate(corrected_m_over_n, n_hashes)\n",
" \n",
"    # Heuristic: But don't allow p to be set to 0, always use at least 10e-7 (1 in 1M)\n",
" corrected_p = max(10e-7, corrected_p)\n",
" corrected_beta = stats.binom.cdf(1, nu - kappa, corrected_p) - stats.binom.cdf(0, nu - kappa, corrected_p)\n",
" expected_indeterminate_results = expected_indeterminate_results * corrected_beta\n",
" \n",
" if debug:\n",
" print(f\"Expect {int(expected_indeterminate_results):,} indeterminate results in next array ({corrected_m_over_n}, corrected p {corrected_p:.10f}), corrected beta {corrected_beta:.4f}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Input configuration requirements are:\n",
"\n",
"`MAX_VALUE` (𝜃) = 1000\n",
"`MAX_FALSE_POSITIVE_RATE` (𝛼) = 0.001\n",
"`MAX_INDETERMINACY_RATE` (corrected 𝛽) = 0\n",
"`N_ELEMENTS` (n) = 10,000,000,000\n",
"`MAX_SCALEDOWN` = 0.001 (recommended standard value)\n",
"\n",
"Recommended parameters are: \n",
"\n",
"`size` (mκ) = 300,000,000,000\n",
"`n_hashes` (k) = 7\n",
"`marker_width` (ν) = 46\n",
"`n_marker_bits` (κ) = 2\n",
"`secondary_scaledown` (uncorrected Array_0 β) = 0.043\n",
"`max_scaledown` (-) = 0.001 (recommended standard value)\n",
"`n_secondaries` (number of Array_x's) = 4\n",
"\n"
]
}
],
"source": [
"print(f\"\"\"\n",
"Input configuration requirements are:\n",
"\n",
"`MAX_VALUE` (𝜃) = {MAX_VALUE}\n",
"`MAX_FALSE_POSITIVE_RATE` (𝛼) = {MAX_FALSE_POSITIVE_RATE}\n",
"`MAX_INDETERMINACY_RATE` (𝛽) = {MAX_INDETERMINACY_RATE}\n",
"`N_ELEMENTS` (n) = {int(N_ELEMENTS)}\n",
"`MAX_INDETERMINACY_RATE` (corrected 𝛽) = {MAX_INDETERMINACY_RATE}\n",
"`N_ELEMENTS` (n) = {int(N_ELEMENTS):,}\n",
"`MAX_SCALEDOWN` = {MAX_SCALEDOWN} (recommended standard value)\n",
"\n",
"Recommended parameters are: \n",
"\n",
"`size` (mκ) = {N_ELEMENTS * m_over_n * kappa}\n",
"`size` (mκ) = {int(N_ELEMENTS * m_over_n * kappa):,}\n",
"`n_hashes` (k) = {n_hashes}\n",
"`marker_width` (ν) = {nu}\n",
"`n_marker_bits` (κ) = {kappa}\n",
"`secondary_scaledown` (uncorrected Array_0 β) = {beta_without_correction}\n",
"`secondary_scaledown` (uncorrected Array_0 β) = {np.ceil(uncorrected_beta * 1000)/1000:.3f}\n",
"`max_scaledown` (-) = {MAX_SCALEDOWN} (recommended standard value)\n",
"`n_secondaries` (number of Array_x's) = {n_secondaries}\n",
"\"\"\")\n",
"#nu, kappa, m_over_n, n_hashes, alpha, p, beta_without_correction, n_secondaries"
"\"\"\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand Down

0 comments on commit 413506f

Please sign in to comment.