Commit 2b4fd55: fix merge issue
poldrack committed Apr 1, 2024
2 parents 75ac25c + 3092228
Showing 15 changed files with 37 additions and 31 deletions.
4 changes: 2 additions & 2 deletions notebooks/03-DataVisualization.ipynb
@@ -387,8 +387,8 @@
" data=adult_nhanes_data)\n",
"plt.plot([adult_nhanes_data['SystolicBloodPres1StRdgMmHg'].min(),\n",
" adult_nhanes_data['SystolicBloodPres1StRdgMmHg'].max()],\n",
" [adult_nhanes_data['SystolicBloodPres2NdRdgMmHg'].min(),\n",
" adult_nhanes_data['SystolicBloodPres2NdRdgMmHg'].max()],\n",
" [adult_nhanes_data['SystolicBloodPres1StRdgMmHg'].min(),\n",
" adult_nhanes_data['SystolicBloodPres1StRdgMmHg'].max()],\n",
" color='k')\n",
"plt.xlabel('Systolic BP - First reading')\n",
"plt.ylabel('Systolic BP - Second reading')"
6 changes: 3 additions & 3 deletions notebooks/03-DataVisualization.py
@@ -6,7 +6,7 @@
# extension: .py
# format_name: percent
# format_version: '1.3'
-# jupytext_version: 1.4.2
+# jupytext_version: 1.15.2
# kernelspec:
# display_name: Python 3
# language: python
@@ -152,8 +152,8 @@
data=adult_nhanes_data)
plt.plot([adult_nhanes_data['SystolicBloodPres1StRdgMmHg'].min(),
adult_nhanes_data['SystolicBloodPres1StRdgMmHg'].max()],
-[adult_nhanes_data['SystolicBloodPres2NdRdgMmHg'].min(),
-adult_nhanes_data['SystolicBloodPres2NdRdgMmHg'].max()],
+[adult_nhanes_data['SystolicBloodPres1StRdgMmHg'].min(),
+adult_nhanes_data['SystolicBloodPres1StRdgMmHg'].max()],
color='k')
plt.xlabel('Systolic BP - First reading')
plt.ylabel('Systolic BP - Second reading')
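The corrected call draws a true identity line: both endpoints now come from the first reading's range, so a point with equal first and second readings falls exactly on the line. A minimal sketch of the idea with hypothetical data (the variable names and values here are illustrative, not from NHANES):

```python
import matplotlib
matplotlib.use('Agg')  # render without a display
import matplotlib.pyplot as plt
import numpy as np

rng = np.random.default_rng(0)
first = rng.normal(120, 15, size=100)        # hypothetical first readings
second = first + rng.normal(0, 5, size=100)  # noisy repeat readings

plt.scatter(first, second)
# identity line: x and y endpoints come from the *same* variable's range
lims = [first.min(), first.max()]
plt.plot(lims, lims, color='k')
plt.xlabel('First reading')
plt.ylabel('Second reading')
```

Using the y-variable's range for the y endpoints (as the old code did) would instead draw a line whose slope depends on the two ranges, which is not a valid reference for test-retest agreement.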
2 changes: 1 addition & 1 deletion notebooks/06-Sampling.ipynb
@@ -79,7 +79,7 @@
"# we need to use the maximum of those data to set\n",
"# the height of the vertical line that shows the mean\n",
"plt.axvline(x=adult_nhanes_data['Height'].mean(),\n",
" ymax=np.max(hist[0]), color='k')\n",
" ymax=1, color='k')\n",
"\n",
"# draw the normal distribution with same mean and standard deviation\n",
"# as the sampling distribution\n",
4 changes: 2 additions & 2 deletions notebooks/06-Sampling.py
@@ -6,7 +6,7 @@
# extension: .py
# format_name: percent
# format_version: '1.3'
-# jupytext_version: 1.4.2
+# jupytext_version: 1.15.2
# kernelspec:
# display_name: Python 3
# language: python
@@ -62,7 +62,7 @@
# we need to use the maximum of those data to set
# the height of the vertical line that shows the mean
plt.axvline(x=adult_nhanes_data['Height'].mean(),
-ymax=np.max(hist[0]), color='k')
+ymax=1, color='k')

# draw the normal distribution with same mean and standard deviation
# as the sampling distribution
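This fix matters because `axvline`'s `ymin`/`ymax` arguments are in axes-fraction coordinates (0 is the bottom of the axes, 1 is the top), not data coordinates, so passing the histogram's maximum density as `ymax` scaled the line incorrectly. A sketch with simulated heights:

```python
import matplotlib
matplotlib.use('Agg')  # render without a display
import matplotlib.pyplot as plt
import numpy as np

heights = np.random.default_rng(1).normal(170, 10, size=1000)  # simulated data
hist = plt.hist(heights, bins=30, density=True)

# ymax=1 means "span the full height of the axes"; it is a fraction of
# the axes height, not a density value such as np.max(hist[0])
line = plt.axvline(x=heights.mean(), ymax=1, color='k')
```

With `ymax=np.max(hist[0])`, a peak density below 1 would truncate the line and a peak density above 1 would be clipped, neither of which is the intended "line up to the top of the plot".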
2 changes: 1 addition & 1 deletion notebooks/07-ResamplingAndSimulation.ipynb
@@ -97,7 +97,7 @@
"lines_to_next_cell": 2
},
"source": [
"Now let's find the 99th percentile of the maximum distriibution. There is a built-in function in the `scipy.stats` module, called `scoreatpercentile` that will do this for us:\n"
"Now let's find the 95th percentile of the maximum distriibution. There is a built-in function in the `scipy.stats` module, called `scoreatpercentile` that will do this for us:\n"
]
},
{
4 changes: 2 additions & 2 deletions notebooks/07-ResamplingAndSimulation.py
@@ -6,7 +6,7 @@
# extension: .py
# format_name: percent
# format_version: '1.3'
-# jupytext_version: 1.4.2
+# jupytext_version: 1.15.2
# kernelspec:
# display_name: Python 3
# language: python
@@ -83,7 +83,7 @@ def sample_and_return_max(sample_size,


# %% [markdown]
-# Now let's find the 99th percentile of the maximum distriibution. There is a built-in function in the `scipy.stats` module, called `scoreatpercentile` that will do this for us:
+# Now let's find the 95th percentile of the maximum distriibution. There is a built-in function in the `scipy.stats` module, called `scoreatpercentile` that will do this for us:
#


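`scipy.stats.scoreatpercentile` returns the value at a given percentile of a sample, here the cutoff below which 95% of the simulated maxima fall. A small sketch of the idea (the sample sizes are illustrative, not taken from the notebook):

```python
import numpy as np
from scipy.stats import scoreatpercentile

rng = np.random.default_rng(2)
# simulated sampling distribution of the maximum of a
# standard normal sample of size 5
maxima = np.array([rng.normal(size=5).max() for _ in range(10000)])

# value that 95% of the simulated maxima fall below
cutoff = scoreatpercentile(maxima, 95)
```

By construction, about 95% of the simulated maxima lie at or below `cutoff`, which is what makes it usable as a 95% threshold.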
5 changes: 3 additions & 2 deletions notebooks/08-HypothesisTesting.ipynb
@@ -57,7 +57,7 @@
"source": [
"import scipy.stats\n",
"\n",
"pvalue = 100 - scipy.stats.percentileofscore(flip_results_df, 0.7) \n",
"pvalue = 100 - scipy.stats.percentileofscore(flip_results_df, 70) \n",
"pvalue"
]
},
@@ -144,7 +144,8 @@
"for run in range(num_runs):\n",
" sim_results_df.loc[run, 'p_value'] = sample_ttest()\n",
"\n",
"p_error = sim_results_df.loc[sim_results_df['p_value'] < 0.05].mean(axis=0)\n",
"p_error = sim_results_df['p_value'] < 0.05\n",
"p_error = p_error.mean(axis=0)\n",
"p_error"
]
},
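The corrected value 70 reflects that `flip_results_df` holds head *counts* out of 100 flips, so the observed result must be on the same scale as the simulated values; `percentileofscore` returns the percentage of scores at or below the given value, and subtracting from 100 gives a percentage-scale p-value. A sketch with simulated flips (the simulation size here is illustrative):

```python
import numpy as np
import scipy.stats

rng = np.random.default_rng(3)
# number of heads in 10,000 simulated runs of 100 fair-coin flips
flip_results = rng.binomial(n=100, p=0.5, size=10000)

# percentage of simulated runs at or above 70 heads
# (a percentage on the 0-100 scale, not a proportion)
pvalue = 100 - scipy.stats.percentileofscore(flip_results, 70)
```

Passing 0.7 (a proportion) to a distribution of counts would place the score below essentially every simulated value, yielding a meaningless p-value near 100.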
7 changes: 4 additions & 3 deletions notebooks/08-HypothesisTesting.py
@@ -6,7 +6,7 @@
# extension: .py
# format_name: percent
# format_version: '1.3'
-# jupytext_version: 1.4.2
+# jupytext_version: 1.15.2
# kernelspec:
# display_name: Python 3
# language: python
@@ -50,7 +50,7 @@ def toss_coins_and_count_heads(num_coins=100, p_heads=0.5):
# %%
import scipy.stats

-pvalue = 100 - scipy.stats.percentileofscore(flip_results_df, 0.7)
+pvalue = 100 - scipy.stats.percentileofscore(flip_results_df, 70)
pvalue

# %% [markdown]
@@ -99,7 +99,8 @@ def sample_ttest(sampSize=32):
for run in range(num_runs):
sim_results_df.loc[run, 'p_value'] = sample_ttest()

-p_error = sim_results_df.loc[sim_results_df['p_value'] < 0.05].mean(axis=0)
+p_error = sim_results_df['p_value'] < 0.05
+p_error = p_error.mean(axis=0)
p_error

# %% [markdown]
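The original line filtered the DataFrame to the significant runs and averaged *those p-values*, which is the mean significant p-value, not an error rate. The fix averages a boolean Series instead, and the mean of booleans is the proportion of True values, i.e. the simulated Type I error rate. Sketched with uniform p-values, as expected under the null hypothesis:

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(4)
# under the null hypothesis, p-values are uniformly distributed on [0, 1]
sim_results_df = pd.DataFrame({'p_value': rng.uniform(size=10000)})

# mean of a boolean Series = proportion of True values
# = proportion of runs falsely declared significant
p_error = (sim_results_df['p_value'] < 0.05).mean()
```

With a .05 threshold and null data, `p_error` should come out close to 0.05, whereas the old filtered-mean version would return roughly 0.025 (the midpoint of the significant p-values).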
2 changes: 1 addition & 1 deletion notebooks/09-StatisticalPower.ipynb
@@ -37,7 +37,7 @@
"source": [
"## Power analysis\n",
"\n",
"We can compute a power analysis using functions from the `statsmodels.stats.power` package. Let's focus on the power for an independent samples t-test in order to determine a difference in the mean between two groups. Let's say that we think than an effect size of Cohen's d=0.5 is realistic for the study in question (based on previous research) and would be of scientific interest. We wish to have 80% power to find the effect if it exists. We can compute the sample size needed for adequate power using the `TTestIndPower()` function:"
"We can compute a power analysis using functions from the `statsmodels.stats.power` package. Let's focus on the power for an independent samples t-test in order to determine a difference in the mean between two groups. Let's say that we think that an effect size of Cohen's d=0.5 is realistic for the study in question (based on previous research) and would be of scientific interest. We wish to have 80% power to find the effect if it exists. We can compute the sample size needed for adequate power using the `TTestIndPower()` function:"
]
},
{
4 changes: 2 additions & 2 deletions notebooks/09-StatisticalPower.py
@@ -6,7 +6,7 @@
# extension: .py
# format_name: percent
# format_version: '1.3'
-# jupytext_version: 1.4.2
+# jupytext_version: 1.15.2
# kernelspec:
# display_name: Python 3
# language: python
@@ -33,7 +33,7 @@
# %% [markdown]
# ## Power analysis
#
-# We can compute a power analysis using functions from the `statsmodels.stats.power` package. Let's focus on the power for an independent samples t-test in order to determine a difference in the mean between two groups. Let's say that we think than an effect size of Cohen's d=0.5 is realistic for the study in question (based on previous research) and would be of scientific interest. We wish to have 80% power to find the effect if it exists. We can compute the sample size needed for adequate power using the `TTestIndPower()` function:
+# We can compute a power analysis using functions from the `statsmodels.stats.power` package. Let's focus on the power for an independent samples t-test in order to determine a difference in the mean between two groups. Let's say that we think that an effect size of Cohen's d=0.5 is realistic for the study in question (based on previous research) and would be of scientific interest. We wish to have 80% power to find the effect if it exists. We can compute the sample size needed for adequate power using the `TTestIndPower()` function:

# %%

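The power analysis described in that paragraph can be sketched with `statsmodels` (assuming it is installed); leaving the sample size unspecified tells `solve_power` to solve for it:

```python
from statsmodels.stats.power import TTestIndPower

# per-group sample size needed to detect Cohen's d = 0.5
# with 80% power at alpha = .05 in an independent-samples t-test
analysis = TTestIndPower()
n_per_group = analysis.solve_power(effect_size=0.5, power=0.8, alpha=0.05)
```

`solve_power` solves for whichever of its arguments is left as `None`, so the same call pattern can instead return the achievable power for a fixed `nobs1`.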
10 changes: 6 additions & 4 deletions notebooks/10-BayesianStatistics.ipynb
@@ -9,16 +9,14 @@
"\n",
"## Applying Bayes' theorem: A simple example\n",
"TBD: MOVE TO MULTIPLE TESTING EXAMPLE SO WE CAN USE BINOMIAL LIKELIHOOD\n",
"A person has a cough and flu-like symptoms, and gets a PCR test for COVID-19, which comes back postiive. What is the likelihood that they actually have COVID-19, as opposed a regular cold or flu? We can use Bayes' theorem to compute this. Let's say that the local rate of symptomatic individuals who actually are infected with COVID-19 is 7.4% (as [reported](https://twitter.com/Bob_Wachter/status/1281792549309386752/photo/1) on July 10, 2020 for San Francisco); thus, our prior probability that someone with symptoms actually has COVID-19 is .074. The RT-PCR test used to identify COVID-19 RNA is highly specific (that is, it very rarelly reports the presence of the virus when it is not present); for our example, we will say that the specificity is 99%. Its sensitivity is not known, but probably is no higher than 90%. \n",
"A person has a cough and flu-like symptoms, and gets a PCR test for COVID-19, which comes back postiive. What is the likelihood that they actually have COVID-19, as opposed to a regular cold or flu? We can use Bayes' theorem to compute this. Let's say that the local rate of symptomatic individuals who actually are infected with COVID-19 is 7.4% (as [reported](https://twitter.com/Bob_Wachter/status/1281792549309386752/photo/1) on July 10, 2020 for San Francisco); thus, our prior probability that someone with symptoms actually has COVID-19 is .074. The RT-PCR test used to identify COVID-19 RNA is highly specific (that is, it very rarelly reports the presence of the virus when it is not present); for our example, we will say that the specificity is 99%. Its sensitivity is not known, but probably is no higher than 90%. \n",
"First let's look at the probability of disease given a single positive test."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"lines_to_next_cell": 3
},
"metadata": {},
"outputs": [],
"source": [
"\n",
@@ -29,6 +27,10 @@
"marginal_likelihood = sensitivity * prior + (1 - specificity) * (1 - prior)\n",
"posterior = (likelihood * prior) / marginal_likelihood\n",
"posterior\n",
"\n",
"\n",
"\n",
"\n",
"\n"
]
},
6 changes: 4 additions & 2 deletions notebooks/10-BayesianStatistics.py
@@ -6,7 +6,7 @@
# extension: .py
# format_name: percent
# format_version: '1.3'
-# jupytext_version: 1.4.2
+# jupytext_version: 1.15.2
# kernelspec:
# display_name: Python 3
# language: python
@@ -19,7 +19,7 @@
#
# ## Applying Bayes' theorem: A simple example
# TBD: MOVE TO MULTIPLE TESTING EXAMPLE SO WE CAN USE BINOMIAL LIKELIHOOD
-# A person has a cough and flu-like symptoms, and gets a PCR test for COVID-19, which comes back postiive. What is the likelihood that they actually have COVID-19, as opposed a regular cold or flu? We can use Bayes' theorem to compute this. Let's say that the local rate of symptomatic individuals who actually are infected with COVID-19 is 7.4% (as [reported](https://twitter.com/Bob_Wachter/status/1281792549309386752/photo/1) on July 10, 2020 for San Francisco); thus, our prior probability that someone with symptoms actually has COVID-19 is .074. The RT-PCR test used to identify COVID-19 RNA is highly specific (that is, it very rarelly reports the presence of the virus when it is not present); for our example, we will say that the specificity is 99%. Its sensitivity is not known, but probably is no higher than 90%.
+# A person has a cough and flu-like symptoms, and gets a PCR test for COVID-19, which comes back postiive. What is the likelihood that they actually have COVID-19, as opposed to a regular cold or flu? We can use Bayes' theorem to compute this. Let's say that the local rate of symptomatic individuals who actually are infected with COVID-19 is 7.4% (as [reported](https://twitter.com/Bob_Wachter/status/1281792549309386752/photo/1) on July 10, 2020 for San Francisco); thus, our prior probability that someone with symptoms actually has COVID-19 is .074. The RT-PCR test used to identify COVID-19 RNA is highly specific (that is, it very rarelly reports the presence of the virus when it is not present); for our example, we will say that the specificity is 99%. Its sensitivity is not known, but probably is no higher than 90%.
# First let's look at the probability of disease given a single positive test.

# %%
@@ -36,6 +36,8 @@





# %% [markdown]
# The high specificity of the test, along with the relatively high base rate of the disease, means that most people who test positive actually have the disease.
# Now let's plot the posterior as a function of the prior. Let's first create a function to compute the posterior, and then apply this with a range of values for the prior.
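The variable assignments in this cell are hidden by the collapsed diff context, but the surrounding prose states the values: prior = .074, specificity = .99, sensitivity = .90. With those values, the Bayes computation shown in the cell works out as follows (a sketch, reconstructing only what the text states):

```python
# values stated in the surrounding text
prior = 0.074        # P(COVID | symptomatic), San Francisco, July 2020
sensitivity = 0.90   # P(positive test | COVID)
specificity = 0.99   # P(negative test | no COVID)

likelihood = sensitivity
# total probability of a positive test: true positives + false positives
marginal_likelihood = sensitivity * prior + (1 - specificity) * (1 - prior)
posterior = (likelihood * prior) / marginal_likelihood
# posterior is roughly 0.88: most symptomatic people who test
# positive actually have the disease, as the next cell notes
```

The relatively high base rate among symptomatic people does most of the work here; with a prior of, say, .001, the same test would give a posterior well under 10%.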
4 changes: 2 additions & 2 deletions notebooks/13-GeneralLinearModel.ipynb
@@ -4,7 +4,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"# The General Linear Model in R\n",
"# The General Linear Model\n",
"In this chapter we will explore how to fit general linear models in Python. We will focus on the tools provided by the `statsmodels` package."
]
},
@@ -134,7 +134,7 @@
"import seaborn as sns\n",
"import scipy.stats\n",
"\n",
"scipy.stats.probplot(ols_result.resid, plot=sns.mpl.pyplot)"
"_ = scipy.stats.probplot(ols_result.resid, plot=sns.mpl.pyplot)"
]
},
{
6 changes: 3 additions & 3 deletions notebooks/13-GeneralLinearModel.py
@@ -6,15 +6,15 @@
# extension: .py
# format_name: percent
# format_version: '1.3'
-# jupytext_version: 1.4.2
+# jupytext_version: 1.15.2
# kernelspec:
# display_name: Python 3
# language: python
# name: python3
# ---

# %% [markdown]
-# # The General Linear Model in R
+# # The General Linear Model
# In this chapter we will explore how to fit general linear models in Python. We will focus on the tools provided by the `statsmodels` package.

# %%
@@ -95,7 +95,7 @@ def generate_linear_data(slope, intercept,
import seaborn as sns
import scipy.stats

-scipy.stats.probplot(ols_result.resid, plot=sns.mpl.pyplot)
+_ = scipy.stats.probplot(ols_result.resid, plot=sns.mpl.pyplot)

# %% [markdown]
# This looks pretty good, in the sense that the residual data points fall very close to the unit line. This is not surprising, since we generated the data with normally distributed noise. We should also plot the predicted (or *fitted*) values against the residuals, to make sure that the model does work systematically better for some predicted values versus others.
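`scipy.stats.probplot` returns its computed quantiles and line-fit parameters in addition to drawing the Q-Q plot, so binding the result to `_` simply suppresses the tuple that a notebook would otherwise echo below the figure. A sketch with simulated residuals standing in for `ols_result.resid` (`sns.mpl.pyplot` is just `matplotlib.pyplot`, used directly here):

```python
import matplotlib
matplotlib.use('Agg')  # render without a display
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats

# stand-in for ols_result.resid: normally distributed residuals
resid = np.random.default_rng(5).normal(size=200)

# returns ((osm, osr), (slope, intercept, r)); in a notebook,
# `_ = scipy.stats.probplot(...)` hides this tuple from the output
result = scipy.stats.probplot(resid, plot=plt)
slope, intercept, r = result[1]
```

For well-behaved residuals the fitted line's slope approximates their standard deviation and `r` is close to 1, which is what "points fall close to the unit line" means in the commentary that follows.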
2 changes: 1 addition & 1 deletion notebooks/index.md
@@ -23,7 +23,7 @@ I apologize up front that the datasets are heavily US-centric. This is primaril

This book is meant to be a living document, which is why its source is available online at [https://github.com/statsthinking21/statsthinking21-python](https://github.com/statsthinking21/statsthinking21-python). If you find any errors in the book or want to make a suggestion for how to improve it, please open an issue on the Github site. Even better, submit a pull request with your suggested change.

-The book is licensed according to the [Creative Commons Attribution 2.0 Generic (CC BY 2.0) License](https://creativecommons.org/licenses/by/2.0/). Please see the terms of that license for more details.
+This book is licensed using the [Creative Commons Attribution-NonCommercial 4.0 International (CC BY-NC 4.0) License](https://creativecommons.org/licenses/by-nc/4.0/). Please see the terms of that license for more details.

## Acknowledgements

