Skip to content

Commit

Permalink
Improve notebook to handle max_scaledown param and more accurate calcs
Browse files Browse the repository at this point in the history
To squash into 66d79b8.
  • Loading branch information
boydgreenfield committed Feb 11, 2023
1 parent 2b51253 commit 413506f
Showing 1 changed file with 75 additions and 18 deletions.
93 changes: 75 additions & 18 deletions docs/notebook/calculate-parameters.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -24,7 +24,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -95,7 +95,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -116,46 +116,103 @@
"bits_per_element = m_over_n * kappa\n",
"\n",
"# Next, we compute the implied indeterminacy error rate and the required number and size of secondary arrays\n",
"beta_without_correction = stats.binom.cdf(1, nu - kappa, p) - stats.binom.cdf(0, nu - kappa, p) # this is also the scaledown factor\n",
"n_secondaries = -1\n",
"uncorrected_beta = stats.binom.cdf(1, nu - kappa, p) - stats.binom.cdf(0, nu - kappa, p) # this is also the scaledown factor\n",
"n_secondaries = 0\n",
"calculated_indeterminacy_rate = np.inf\n",
"\n",
"# TODO: This can be refined to allow a maximum scaledown of the array (as we do in the implementation)\n",
"#\n",
"secondary_array_size = N_ELEMENTS\n",
"expected_indeterminate_results = int(N_ELEMENTS * uncorrected_beta)\n",
"debug = False\n",
"while calculated_indeterminacy_rate > MAX_INDETERMINACY_RATE:\n",
" # Stop if the expected number of indeterminate results is < 0.5 \n",
" if expected_indeterminate_results < 0.5:\n",
" break\n",
"\n",
" # Scale the secondary array down by the uncorrected 𝛽\n",
" n_secondaries += 1 \n",
" calculated_indeterminacy_rate = np.power(beta_without_correction, n_secondaries)\n",
" expected_indeterminate_results = N_ELEMENTS * calculated_indeterminacy_rate\n",
" if expected_indeterminate_results < 0.1:\n",
" calculated_indeterminacy_rate = 0.0"
" secondary_array_size = int(secondary_array_size * uncorrected_beta)\n",
" \n",
" # But never make an array smaller than N_ELEMENTS * MAX_SCALEDOWN\n",
" if secondary_array_size < N_ELEMENTS * MAX_SCALEDOWN:\n",
" secondary_array_size = int(N_ELEMENTS * MAX_SCALEDOWN)\n",
"\n",
" if debug:\n",
" print(f\"The #{n_secondaries} secondary array will be {secondary_array_size:,} elements ({int(expected_indeterminate_results):,} expected elements)\")\n",
" \n",
" # Now calculate the expected number of indeterminate results flowing *out* of the nth secondary array\n",
" secondary_array_size_bits = secondary_array_size * bits_per_element\n",
" corrected_m_over_n = (secondary_array_size / expected_indeterminate_results) * m_over_n\n",
" corrected_p = calculate_fp_rate(corrected_m_over_n, n_hashes)\n",
" \n",
"    # Heuristic: But don't allow p to be set to 0, always use at least 10e-7 (1 in 1M)\n",
" corrected_p = max(10e-7, corrected_p)\n",
" corrected_beta = stats.binom.cdf(1, nu - kappa, corrected_p) - stats.binom.cdf(0, nu - kappa, corrected_p)\n",
" expected_indeterminate_results = expected_indeterminate_results * corrected_beta\n",
" \n",
" if debug:\n",
" print(f\"Expect {int(expected_indeterminate_results):,} indeterminate results in next array ({corrected_m_over_n}, corrected p {corrected_p:.10f}), corrected beta {corrected_beta:.4f}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Input configuration requirements are:\n",
"\n",
"`MAX_VALUE` (𝜃) = 1000\n",
"`MAX_FALSE_POSITIVE_RATE` (𝛼) = 0.001\n",
"`MAX_INDETERMINACY_RATE` (corrected 𝛽) = 0\n",
"`N_ELEMENTS` (n) = 10,000,000,000\n",
"`MAX_SCALEDOWN` = 0.001 (recommended standard value)\n",
"\n",
"Recommended parameters are: \n",
"\n",
"`size` (mκ) = 300,000,000,000\n",
"`n_hashes` (k) = 7\n",
"`marker_width` (ν) = 46\n",
"`n_marker_bits` (κ) = 2\n",
"`secondary_scaledown` (uncorrected Array_0 β) = 0.043\n",
"`max_scaledown` (-) = 0.001 (recommended standard value)\n",
"`n_secondaries` (number of Array_x's) = 4\n",
"\n"
]
}
],
"source": [
"print(f\"\"\"\n",
"Input configuration requirements are:\n",
"\n",
"`MAX_VALUE` (𝜃) = {MAX_VALUE}\n",
"`MAX_FALSE_POSITIVE_RATE` (𝛼) = {MAX_FALSE_POSITIVE_RATE}\n",
"`MAX_INDETERMINACY_RATE` (𝛽) = {MAX_INDETERMINACY_RATE}\n",
"`N_ELEMENTS` (n) = {int(N_ELEMENTS)}\n",
"`MAX_INDETERMINACY_RATE` (corrected 𝛽) = {MAX_INDETERMINACY_RATE}\n",
"`N_ELEMENTS` (n) = {int(N_ELEMENTS):,}\n",
"`MAX_SCALEDOWN` = {MAX_SCALEDOWN} (recommended standard value)\n",
"\n",
"Recommended parameters are: \n",
"\n",
"`size` (mκ) = {N_ELEMENTS * m_over_n * kappa}\n",
"`size` (mκ) = {int(N_ELEMENTS * m_over_n * kappa):,}\n",
"`n_hashes` (k) = {n_hashes}\n",
"`marker_width` (ν) = {nu}\n",
"`n_marker_bits` (κ) = {kappa}\n",
"`secondary_scaledown` (uncorrected Array_0 β) = {beta_without_correction}\n",
"`secondary_scaledown` (uncorrected Array_0 β) = {np.ceil(uncorrected_beta * 1000)/1000:.3f}\n",
"`max_scaledown` (-) = {MAX_SCALEDOWN} (recommended standard value)\n",
"`n_secondaries` (number of Array_x's) = {n_secondaries}\n",
"\"\"\")\n",
"#nu, kappa, m_over_n, n_hashes, alpha, p, beta_without_correction, n_secondaries"
"\"\"\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand Down

0 comments on commit 413506f

Please sign in to comment.