-
Notifications
You must be signed in to change notification settings - Fork 0
/
Project.R
236 lines (197 loc) · 9.36 KB
/
Project.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
###########################################################
# STAT 35500 - Project #
# Analysis of COVID 19 across different States in the USA #
# By - Shivam Bairoliya #
# The .csv file should be in the same directory as the #
# R project. It can be downloaded from #
# https://www.kaggle.com/nightranger77/covid19-state-data #
###########################################################
#Read the data from the CSV file (expected in the working directory)
full_data <- read.csv("COVID19_state.csv")
#Extract all the relevant columns
data <- full_data[, c("State", "Tested", "Infected", "Deaths", "Pop.Density",
                      "Smoking.Rate", "Pollution", "Temperature")]
#Create a column for Case Fatality Ratio (deaths as a percentage of infected)
data$Case.Fatality <- round((data$Deaths / data$Infected) * 100, 3)
#Create a column for Positivity Rate (infected as a percentage of tested)
data$Positivity.Rate <- round((data$Infected / data$Tested) * 100, 3)
#Create a column for ICU Beds per 10000
data$ICU.Beds.10k <- round((full_data$ICU.Beds / full_data$Population) * 10000, 3)
#Create a column for Physicians per 10000
data$Physicians.10k <- round((full_data$Physicians / full_data$Population) * 10000, 3)
#Manually create a vector with Political Affiliations of the governor.
#The values are matched to the rows purely by position, so they are only
#correct while the CSV keeps the row order this vector was typed for.
governor_party <- c("R", "R", "R", "R", "D", "D", "D", "D", "D", "R", "R", "D", "R",
                    "D", "R", "R", "D", "D", "D", "D", "R", "R", "D", "D", "R", "R",
                    "D", "R", "D", "R", "D", "D", "D", "D", "R", "R", "R", "D", "D",
                    "D", "R", "R", "R", "R", "R", "R", "D", "D", "R", "D", "R")
#Fail loudly instead of letting the vector be recycled when the number of rows
#in the CSV does not match the number of affiliations typed above
stopifnot(length(governor_party) == nrow(data))
data$Governors <- governor_party
#attach() is kept so that code referring to the columns by bare name keeps
#working. Any copy left on the search path by an earlier run is removed first
#so that re-running the script does not stack stale copies of the data.
while ("data" %in% search()) {
  detach("data", character.only = TRUE)
}
attach(data)
#Display the data being used
head(data)
#Descriptive statistics: for every variable print the mean and the standard
#deviation and draw one bar per state.
#The ylim values are fixed by hand, presumably chosen for the snapshot of the
#dataset this script was written for - TODO confirm when the data changes.
#Columns are read through data$ so this section does not depend on attach().

#Enlarge the bottom/left margins and move the axis titles outwards so the
#rotated (las = 2) state labels fit; par() returns the previous values, which
#are kept for the reset at the end of this section
bar_par <- par(mar = c(6.5, 6.5, 2, 0.5), mgp = c(5, 1, 0))

#Mean and Standard Deviation for Total Tests
mean(data$Tested)
sd(data$Tested)
#The bar plot for Total Tests
barplot(data$Tested, names.arg = data$State, las = 2, cex.names = 0.70,
        col = "light blue", ylab = "Tested", xlab = "State",
        ylim = c(0, 2e07), main = "Total Tests Conducted")

#Mean and Standard Deviation for total Infected
mean(data$Infected)
sd(data$Infected)
#The bar plot for Infected
barplot(data$Infected, names.arg = data$State, las = 2, cex.names = 0.70,
        col = "red", ylab = "Infected", xlab = "State",
        ylim = c(0, 1e06), main = "Total Infected")

#Mean and Standard Deviation for Total Deaths
mean(data$Deaths)
sd(data$Deaths)
#The bar plot for Deaths
barplot(data$Deaths, names.arg = data$State, las = 2, cex.names = 0.70,
        col = "black", ylab = "Deaths", xlab = "State",
        ylim = c(0, 28000), main = "Total Deaths")

#Mean and Standard Deviation for Population density
mean(data$Pop.Density)
sd(data$Pop.Density)
#The bar plot for population density
barplot(data$Pop.Density, names.arg = data$State, las = 2, cex.names = 0.70,
        col = "turquoise3", ylab = "Population Density", xlab = "State",
        ylim = c(0, 4000), main = "Population Density")

#Mean and Standard Deviation for Smoking Rate
mean(data$Smoking.Rate)
sd(data$Smoking.Rate)
#The bar plot for Smoking Rate (title typo "Pouplation" corrected)
barplot(data$Smoking.Rate, names.arg = data$State, las = 2, cex.names = 0.70,
        col = "gray50", ylab = "Smoking Rate", xlab = "State",
        ylim = c(0, 27), main = "Percentage of Population that Smokes")

#Mean and Standard Deviation for Pollution
mean(data$Pollution)
sd(data$Pollution)
#The bar plot for Pollution
#NOTE(review): air-pollution concentrations are normally reported in
#micrograms (not microns) per cubic meter - confirm the unit against the
#dataset description before changing the title
barplot(data$Pollution, names.arg = data$State, las = 2, cex.names = 0.70,
        col = "gray23", ylab = "Pollution", xlab = "State",
        ylim = c(0, 13), main = "Pollution in microns per cubic meter")

#Mean and Standard Deviation for Temperature
mean(data$Temperature)
sd(data$Temperature)
#The bar plot for Temperature (title typo "Farenheit" corrected)
barplot(data$Temperature, names.arg = data$State, las = 2, cex.names = 0.70,
        col = heat.colors(80), ylab = "Temperature", xlab = "State",
        ylim = c(0, 75), main = "Average Temperature in Fahrenheit")

#Mean and Standard Deviation for Case Fatality Ratio
mean(data$Case.Fatality)
sd(data$Case.Fatality)
#The bar plot for Case Fatality Ratio (the title is split over two lines)
barplot(data$Case.Fatality, names.arg = data$State, las = 2, cex.names = 0.70,
        col = "gray10", ylab = "Case Fatality", xlab = "State",
        ylim = c(0, 7),
        main = "Percentage of People who died vs People who\ntested +ve for the virus")

#Mean and Standard Deviation for Positivity Rate
mean(data$Positivity.Rate)
sd(data$Positivity.Rate)
#The bar plot for Positivity Rate
barplot(data$Positivity.Rate, names.arg = data$State, las = 2, cex.names = 0.70,
        col = "orangered", ylab = "Positivity Rate", xlab = "State",
        ylim = c(0, 19), main = "Positivity Rate as percentage")

#Mean and Standard Deviation for ICU Beds per 10000 of population
mean(data$ICU.Beds.10k)
sd(data$ICU.Beds.10k)
#The bar plot for ICU Beds per 10000 of population
barplot(data$ICU.Beds.10k, names.arg = data$State, las = 2, cex.names = 0.70,
        col = "orchid", ylab = "ICU beds per 10k population", xlab = "State",
        ylim = c(0, 4.5),
        main = "Number of ICU beds per 10000 of the population")

#Mean and Standard Deviation for Physicians per 10000 of population
mean(data$Physicians.10k)
sd(data$Physicians.10k)
#The bar plot for Physicians per 10000 of population (default y-axis range)
barplot(data$Physicians.10k, names.arg = data$State, las = 2, cex.names = 0.70,
        col = "pink", ylab = "Physicians per 10k population", xlab = "State",
        main = "Number of Physicians per 10000 of the population")

#Number of Democrat Governors/Mayor
sum(data$Governors == "D")
#Number of Republican Governors
sum(data$Governors == "R")

#Reset the plot window to the margins that were active before the bar plots
par(bar_par)
#Boxplots of every variable, drawn several to a page.
#Columns are read through data$ so this section does not depend on attach().

#Adjust the graph window for three boxplots side by side; par() returns the
#previous layout, which is kept for the reset at the end of this section
box_par <- par(mfrow = c(1, 3))

#Boxplot of Tested, Infected and Deaths
boxplot(data$Tested, col = "light blue", main = "Total Tested",
        ylab = "Tested", cex = 1.5)
boxplot(data$Infected, col = "red", main = "Total Tested Positive",
        ylab = "Tested +ve", cex = 1.5)
boxplot(data$Deaths, col = "gray20", main = "Total Deaths",
        ylab = "Deaths", cex = 1.5)

#Boxplot of Population Density, Case Fatality and Positivity Rate
#(axis label typo "Pouplation" corrected)
boxplot(data$Pop.Density, col = "turquoise3", main = "Population Density",
        ylab = "Population Density", cex = 1.5)
boxplot(data$Case.Fatality, col = "gray30", main = "Case Fatality",
        ylab = "Case Fatality", cex = 1.5)
boxplot(data$Positivity.Rate, col = "orangered", main = "Positivity Rate",
        ylab = "Positivity Rate", cex = 1.5)

#Boxplot of Smoking Rate, Pollution, Temperature
boxplot(data$Smoking.Rate, col = "gray50", main = "Smoking Rate",
        ylab = "Smoking Rate", cex = 1.5)
boxplot(data$Pollution, col = "gray20", main = "Pollution",
        ylab = "Pollution", cex = 1.5)
boxplot(data$Temperature, col = "orangered3", main = "Temperature",
        ylab = "Temperature", cex = 1.5)

#Adjust the window for two Boxplots
par(mfrow = c(1, 2))

#Boxplot of ICU Beds per 10k and Physicians per 10k
#(axis label typo "SICU" corrected so it matches the plot title)
boxplot(data$ICU.Beds.10k, col = "orchid", main = "ICU Beds per 10k",
        ylab = "ICU Beds per 10k", cex = 1.5)
boxplot(data$Physicians.10k, col = "pink", main = "Physicians per 10k",
        ylab = "Physicians per 10k", cex = 1.5)

#Reset the graph window to the layout that was active before the boxplots
par(box_par)
#Question 1
#Is there a significant difference between the mean of Positivity Rate of the
#Republican Controlled States and the Democratic Controlled States?
#Method used - two-sample independent unpaired t-test

#Split the positivity rate by the governor's party. The columns are read
#through data$ so this section does not depend on attach().
Republican.Pos.Rate <- data$Positivity.Rate[data$Governors == "R"]
Democratic.Pos.Rate <- data$Positivity.Rate[data$Governors == "D"]

#Checking for assumptions (approximate normality of each group)
#qqplot of the republican states
qqnorm(Republican.Pos.Rate, col = "red", pch = 16,
       main = "Normal Q-Q Plot of Republican States")
qqline(Republican.Pos.Rate, lwd = 3)
#qqplot of the democratic states
qqnorm(Democratic.Pos.Rate, col = "blue", pch = 16,
       main = "Normal Q-Q Plot of Democratic States")
qqline(Democratic.Pos.Rate, lwd = 3)

#Performing the test
#Null hypothesis(H0) - There is no difference in means
#Alternate hypothesis(H1) - There is a difference in means
t.test(x = Republican.Pos.Rate, y = Democratic.Pos.Rate,
       alternative = "two.sided", paired = FALSE)
#p-value < alpha: reject H0 in favour of H1.
#The test is two-sided, so on its own it only shows that the two means differ;
#the direction is read from the two sample means printed in the output, which
#indicate that the positivity rate of the Republican states is higher than the
#positivity rate of the Democratic states.
#Question 2
#Is there a relationship between the case fatality ratio and the number of ICU
#beds and Physicians relative to the population and is the interaction significant,
#if yes then how much of the variability is explained by these factors?
#Method used - Multiple Regression

#Making the linear Model: both main effects plus their interaction.
#The data frame is passed explicitly so the fit does not depend on attach().
lmodel <- lm(Case.Fatality ~ Physicians.10k * ICU.Beds.10k, data = data)
#Checking Assumptions with the diagnostic plots of the fitted model
plot(lmodel)
#Analyzing the results (coefficients, R-squared, overall F-test)
summary(lmodel)
#p-value < alpha Reject H0 and accept H1
#Looking at the Anova table
anova(lmodel)
#This indicates that the case fatality ratio is related to ICU beds and
#Physicians, and that their interaction is significant. Although the slopes
#cannot be interpreted easily because of the interaction term, we can see
#that there is a relation.
#Question 3
#Is there a correlation between the Positivity Rate and the average Temperature
#of the state?

#Make a linear model; the data frame is passed explicitly so the fit does not
#depend on attach()
temperatureModel <- lm(Positivity.Rate ~ Temperature, data = data)
#Check Assumptions with the diagnostic plots of the fitted model
plot(temperatureModel)
#View a scatter plot of the two variables
plot(Positivity.Rate ~ Temperature, data = data, pch = 16, col = "orangered")
#Test the correlation between the two variables (two-sided)
#with() supplies the columns while keeping their bare names in the printed
#"data:" line of the test output
with(data, cor.test(Positivity.Rate, Temperature, alternative = "two.sided"))
#p-value > alpha: fail to reject H0
#In the end, the data do not show a significant correlation between the
#Positivity Rate and the average temperature of the state.
#NOTE(review): the conclusion originally written here said the Positivity Rate
#IS related to the temperature, which contradicts "p-value > alpha" - confirm
#against the p-value printed by cor.test above.