From 451ce683c1d1b77766b5cf302ef4f6bb88c3930b Mon Sep 17 00:00:00 2001 From: pkaf Date: Wed, 16 Sep 2020 14:00:47 +0800 Subject: [PATCH 1/3] Added sample size estimator for a hypothesis testing in case of binomial proportion. --- docs/sources/CHANGELOG.md | 2 +- mlxtend/evaluate/sample_size_estimate.py | 78 +++++++++++++++++++ .../tests/test_sample_size_estimate.py | 18 +++++ 3 files changed, 97 insertions(+), 1 deletion(-) create mode 100644 mlxtend/evaluate/sample_size_estimate.py create mode 100644 mlxtend/evaluate/tests/test_sample_size_estimate.py diff --git a/docs/sources/CHANGELOG.md b/docs/sources/CHANGELOG.md index 8a57ddc1d..4a7945e7e 100755 --- a/docs/sources/CHANGELOG.md +++ b/docs/sources/CHANGELOG.md @@ -21,7 +21,7 @@ The CHANGELOG for the current development version is available at - The `bias_variance_decomp` now supports Keras estimators. ([#725](https://github.com/rasbt/mlxtend/pull/725) via [@hanzigs](https://github.com/hanzigs)) - Adds new `OneRClassifier` (One Rule Classfier) ([#726](https://github.com/rasbt/mlxtend/pull/726) - +- Sample size estimator for a binomial proportion added. (#729 via [@pkaf](https://github.com/pkaf))) ##### Changes diff --git a/mlxtend/evaluate/sample_size_estimate.py b/mlxtend/evaluate/sample_size_estimate.py new file mode 100644 index 000000000..2e2ca0281 --- /dev/null +++ b/mlxtend/evaluate/sample_size_estimate.py @@ -0,0 +1,78 @@ +# Sebastian Raschka 2014-2020 +# mlxtend Machine Learning Library Extensions +# +# A function to estimate sample size for hypothesis testing. +# Author: Prajwal Kafle +# +# License: BSD 3 clause + +from scipy import stats as st + + +def binomial_proportions(baseline_conversion, minimum_effect, + confidence_level=95, power=80, + test='two-sided'): + """ This function estimates the sample size to detect minimum uptake in a control + group relative to a given baseline conversion rate of a control group + with a given power and confidence level. 
+ + Parameters + ---------- + baseline_conversion: percentage + Conversion rate of variant A/control group. + minimum_effect: percentage + Minimum detectable effect determines conversion rate of variant B/treatment group. + confidence_level: percentage (float, default: 95) + where, significance level = 1 - confidence_level/100. + For a confidence level of 95%, alpha = 0.05. + power: Statistical power in percentage (default: 80%) + beta = 1 - power/100 is Type II error. + test: str (default: two-sided) + Option one-sided is valid too. + + Returns + ------- + n: int + sample size + + Notes + ------ + Function uses t-distribution to find critical values + + Reference + --------- + https://select-statistics.co.uk/calculators/sample-size-calculator-two-proportions/ + + """ + + if minimum_effect <= 0: + raise ValueError("Function is valid only for a positive uptake (i.e. minimum_effect > 0 ).") + + if baseline_conversion <= 0: + raise ValueError("Function is valid only for a positive baseline conversion rate.") + + # type I error + alpha = 1 - confidence_level/100 + + if test == 'two-sided': + alpha = alpha/2 + + # Critical values of the t-distribution at alpha/2 and beta + z_crit = lambda x: st.t.ppf(x, df=1000) + z_alpha_over_2 = z_crit(1 - alpha) + z_beta = z_crit(power/100) + z_total = z_alpha_over_2 + z_beta + + # Converting percentages to proportion + conversion_rate_variant_a = baseline_conversion/100 + + # Conversion rate of variant B is increment in conversion rate of variant A + # by minimum detectable effect. 
+ conversion_rate_variant_b = (1 + minimum_effect/100)*conversion_rate_variant_a + + n = (z_total/(conversion_rate_variant_a - conversion_rate_variant_b))**2 * \ + (conversion_rate_variant_a * (1 - conversion_rate_variant_a) + + conversion_rate_variant_b * (1 - conversion_rate_variant_b)) + + return int(n) + diff --git a/mlxtend/evaluate/tests/test_sample_size_estimate.py b/mlxtend/evaluate/tests/test_sample_size_estimate.py new file mode 100644 index 000000000..b4b523ed1 --- /dev/null +++ b/mlxtend/evaluate/tests/test_sample_size_estimate.py @@ -0,0 +1,18 @@ +# Sebastian Raschka 2014-2020 +# mlxtend Machine Learning Library Extensions +# Author: Prajwal Kafle +# +# License: BSD 3 clause + +from mlxtend.evaluate import sample_size_estimate + + +def test_one_sided_binomial_proportion_estimates(): + n = sample_size_estimate.binomial_proportions(15, 20, 95, 80, + test='one-sided') + assert 1892 == n + + +def test_two_sided_binomial_proportion_estimates(): + n = sample_size_estimate.binomial_proportions(15, 20, 95, 80) + assert 2403 == n From 6c2a7e48fb3870d95f61d0e9d4dd6673120b2d68 Mon Sep 17 00:00:00 2001 From: pkaf Date: Wed, 16 Sep 2020 14:20:41 +0800 Subject: [PATCH 2/3] checked flake8 consistency for the sample size estimator --- mlxtend/evaluate/sample_size_estimate.py | 28 ++++++++++++++---------- 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/mlxtend/evaluate/sample_size_estimate.py b/mlxtend/evaluate/sample_size_estimate.py index 2e2ca0281..2b38fc384 100644 --- a/mlxtend/evaluate/sample_size_estimate.py +++ b/mlxtend/evaluate/sample_size_estimate.py @@ -12,16 +12,18 @@ def binomial_proportions(baseline_conversion, minimum_effect, confidence_level=95, power=80, test='two-sided'): - """ This function estimates the sample size to detect minimum uptake in a control - group relative to a given baseline conversion rate of a control group - with a given power and confidence level. 
+ """ This function estimates the sample size to detect + minimum uptake in a treatment group relative to a given baseline + conversion rate of a control group with a given power and + confidence level. Parameters ---------- baseline_conversion: percentage Conversion rate of variant A/control group. minimum_effect: percentage - Minimum detectable effect determines conversion rate of variant B/treatment group. + Minimum detectable effect determines conversion rate of + variant B/treatment group. confidence_level: percentage (float, default: 95) where, significance level = 1 - confidence_level/100. For a confidence level of 95%, alpha = 0.05. @@ -46,10 +48,12 @@ def binomial_proportions(baseline_conversion, minimum_effect, """ if minimum_effect <= 0: - raise ValueError("Function is valid only for a positive uptake (i.e. minimum_effect > 0 ).") + raise ValueError("Function is valid only for a " + "positive uptake (i.e. minimum_effect > 0 ).") if baseline_conversion <= 0: - raise ValueError("Function is valid only for a positive baseline conversion rate.") + raise ValueError("Function is valid only for a " + "positive baseline conversion rate.") # type I error alpha = 1 - confidence_level/100 @@ -58,9 +62,8 @@ def binomial_proportions(baseline_conversion, minimum_effect, alpha = alpha/2 # Critical values of the t-distribution at alpha/2 and beta - z_crit = lambda x: st.t.ppf(x, df=1000) - z_alpha_over_2 = z_crit(1 - alpha) - z_beta = z_crit(power/100) + z_alpha_over_2 = st.t.ppf(1 - alpha, df=1000) + z_beta = st.t.ppf(power/100, df=1000) z_total = z_alpha_over_2 + z_beta # Converting percentages to proportion @@ -68,11 +71,12 @@ def binomial_proportions(baseline_conversion, minimum_effect, # Conversion rate of variant B is increment in conversion rate of variant A # by minimum detectable effect. 
- conversion_rate_variant_b = (1 + minimum_effect/100)*conversion_rate_variant_a + conversion_rate_variant_b = \ + (1 + minimum_effect/100) * conversion_rate_variant_a - n = (z_total/(conversion_rate_variant_a - conversion_rate_variant_b))**2 * \ + n = (z_total/(conversion_rate_variant_a - + conversion_rate_variant_b))**2 * \ (conversion_rate_variant_a * (1 - conversion_rate_variant_a) + conversion_rate_variant_b * (1 - conversion_rate_variant_b)) return int(n) - From 0e2b4b7105f4be9824ecab2313fefadeb628b37f Mon Sep 17 00:00:00 2001 From: pkaf Date: Wed, 16 Sep 2020 14:24:09 +0800 Subject: [PATCH 3/3] changelog updated to reflect correct pull request number. --- docs/sources/CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/sources/CHANGELOG.md b/docs/sources/CHANGELOG.md index 4a7945e7e..4fe8156ee 100755 --- a/docs/sources/CHANGELOG.md +++ b/docs/sources/CHANGELOG.md @@ -21,7 +21,7 @@ The CHANGELOG for the current development version is available at - The `bias_variance_decomp` now supports Keras estimators. ([#725](https://github.com/rasbt/mlxtend/pull/725) via [@hanzigs](https://github.com/hanzigs)) - Adds new `OneRClassifier` (One Rule Classfier) ([#726](https://github.com/rasbt/mlxtend/pull/726) -- Sample size estimator for a binomial proportion added. (#729 via [@pkaf](https://github.com/pkaf))) +- Sample size estimator for a binomial proportion added. ([#730](https://github.com/rasbt/mlxtend/pull/730) via [@pkaf](https://github.com/pkaf)) ##### Changes