-
Notifications
You must be signed in to change notification settings - Fork 1
/
mixed_aov_v2.R
332 lines (292 loc) · 14.9 KB
/
mixed_aov_v2.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
load("C:/Users/siddh/R/Projects/hourly_data/P70_10days_merged_Clean (1).Rda")
# Please note current script is tailored toward specific DV and IV values for a current Mirzadeh lab project
# Changes would have to be made to ensure your variables are the subject of the script
# The script is designed such that each "section" of the script CREATES and then RUNS "mini-functions" that address parts of the ANOVA process
## Library Importing
library(tidyverse)
library(dplyr)
library(magrittr)
library(ggrepel)
library(ggpubr)
library(rstatix)
library(readxl)
library(scales)
# User Input: Step 1
data_new <- as_tibble(df.hourly)
## Running this line will package the entire script into one function.
# You instead have the option of manually "creating" and running each function for assumptions, ANOVA, post hoc, and visualization
anova_general <- function(data_new) {
## This is something to consider!! Take NOTE: THE FACTOR CONVERSION IS EXTREMELY SPECIFIC
# Step 2 - Assumptions:
# Assumption: Outlier Check
# Below Function tests for outliers.
# If outliers undetected, script proceeds
# If outliers detected, script presents outliers, and user is given option of eliminating or keeping the outlier in their analysis
option_outliers <- function(data_new) {
outliers <- data_new %>%
identify_outliers(FoodIn.cum.kcal)
# Identifies outliers
if (nrow(outliers) > 0) {
# Tests if outliers were produced by seeing if the outliers function produces a size > 0
cat("Outliers detected:\n")
print(outliers)
# Shows outliers
cat("\nOptions:\n")
cat("1. Eliminate detected outliers\n")
cat("2. Keep detected outliers\n")
option <- as.integer(trimws(readline("Enter your choice (1 or 2): ")))
# Offer options for treatment
if (option == 1) {
data_no_outlier <- anti_join(data_new, outliers)
View(data_no_outlier)
assign('data_new', data_no_outlier, envir=.GlobalEnv)
}
# Recreate dataframe without outlier index. It will continue to be called data_new
else if (option == 2) {
message("Script will proceed!")
return(NULL)
# Return original data, keeping outlier in the data without elimination
}
}
else {
message("No outliers, script will proceed!")
return(NULL)
# No outlier pathway
}
}
option_outliers(data_new)
# Assumption: Normality
# Tests normality through 3 possible ways, with descriptions and information in "cat" message presentations
check_normals <- function(data_new) {
cat("\nPlease Select:\n")
cat("1. qqplot, meant for sample size>50 (prone to unreadable visualization if data is too large)\n")
cat("2. Shapiro Test, prone to over-sensitivity in minor deviations for large data\n")
cat("3. Histogram, for subjective self-reference, if data is too large for above options")
choose <- as.integer(trimws(readline("Enter your choice (1, 2, or 3): ")))
if (choose == 1) { # qqplot choice
# Specify faceting
cat("\nPlease Select how you would like your data to be faceted in Time:\n")
cat("1. Facet by unique time points in dataframe (please note too many unique points will create an unreadable plot)\n")
cat("2. If time points are in hours, facet by days. Please note both options may take some time to generate the plot\n")
qqchoose <- as.integer(trimws(readline("Enter your choice (1 or 2): ")))
if (qqchoose == 1) {## qqplot choice 1
p <- ggqqplot(data_new, "FoodIn.cum.kcal", ggtheme = theme_bw()) +
facet_grid(Group~Time)
print(p)
cat("\nPlease see plot printed. Are you satisfied with Normality?:\n")
cat("1. Yes\n")
cat("2. No\n")
normals <- as.integer(trimws(readline("Enter your choice (1 or 2): ")))
if (normals == 1) {
message("Normality Assumption Satisfied")
}
if (normals == 2) {
message("The 2-way Mixed ANOVA is fairly robust even under slight normality violations. Depending on the perceived normality from the qqplot, you also have the option of continuing with this script. Otherwise, alternate statistical analysis may need to be considered")
}
}
if (qqchoose == 2) { ## qqplot choice 2
## Conversion to Days
data_qqtemp <- data_new
data_qqtemp$days <- data_new$Time/24
data_qqtemp$days <- floor(data_qqtemp$days)
p1 <- ggqqplot(data_qqtemp, "FoodIn.cum.kcal", ggtheme = theme_bw()) +
facet_grid(Group~days)
print(p1)
cat("\nPlease see plot printed. Are you satisfied with Normality?:\n")
cat("1. Yes\n")
cat("2. No\n")
normalsqq <- as.integer(trimws(readline("Enter your choice (1 or 2): ")))
if (normalsqq == 1) {
message("Normality Assumption Satisfied")
}
if (normalsqq == 2) {
message("The 2-way Mixed ANOVA is fairly robust even under slight normality violations. Depending on the perceived normality from the qqplot, you also have the option of continuing with this script. Otherwise, alternate statistical analysis may need to be considered")
}
}
}
if (choose == 2) { ## shapiro test choice
c <- shapiro.test(data_new$FoodIn.cum.kcal)
print(c)
cat("\nPlease see p-value printed. A p-value over 0.05 according to the Shapiro test, is statistically significant and deviates from normality. Please keep in mind this test is extremely sensitive to deviation. Are you satisfied with Normality?:\n")
cat("1. Yes\n")
cat("2. No\n")
normals2 <- as.integer(trimws(readline("Enter your choice (1 or 2): ")))
if (normals2 == 1) {
message("Normality Assumption Satisfied")
}
if (normals2 == 2) {
message("The 2-way Mixed ANOVA is fairly robust even under slight normality violations. The shapiro test is extremely sensitive to deviations, especially for larger data sets. You also have the option of continuing with this script. Otherwise, alternate statistical analysis may need to be considered")
}
}
if (choose == 3) { ## basic histogram choice
hist(data_new$FoodIn.cum.kcal, xlab = "Food Intake (kcal)", main="Histogram of Food Intake")
cat("\nPlease see the histogram modelling your data. Are you satisfied with Normality?:\n")
cat("1. Yes\n")
cat("2. No\n")
normals2 <- as.integer(trimws(readline("Enter your choice (1 or 2): ")))
if (normals2 == 1) {
message("Normality Assumption Satisfied")
}
if (normals2 == 2) {
message("The 2-way Mixed ANOVA is fairly robust even under slight normality violations. Depending on the perceived normality from the plot, you also have the option of continuing with this script. Otherwise, alternate statistical analysis may need to be considered")
}
}
}
check_normals(data_new) # Run above function
# Conversion to Factor (do only post-outlier and post-normality)
data_new <- data_new %>%
convert_as_factor(Animal, Time, Group)
# Assumption: Homogeneity of Variance
# The Levene Test is used for checking this
check_variance <- function(data_new) {
levene <- data_new %>%
group_by(Time) %>%
levene_test(FoodIn.cum.kcal ~ Group)
detected_rows <- levene %>%
filter(p < 0.05)
# A value under 0.05 for p is usually an indicator of violation of this assumption
if (nrow(detected_rows) > 0) {
message("Levene Test Violation Detected:")
print(detected_rows)
message("This data does not pass the Levene Homogeneity of Variance Test. Alternate statistical analysis may need to be considered. You also have the option of proceeding.")
} else {
message("Levene Test Assumption Satisfied")
}
}
check_variance(data_new)
# Assumption: Homogeneity of Covariance
# The Box's M Test is used here. Due to the extreme sensitivity of this test, the qualification to pass is p>0.001. Even if data does not meet this low threshold, however, many will continue the ANOVA analysis
check_covariance <- function(data_new){
cov <- box_m(data_new[, "FoodIn.cum.kcal", drop = FALSE], data_new$Group)
print(cov)
if (cov$p.value < 0.001) {
# A value under 0.001 for p is usually an indicator of violation of this assumption
message("This data does not pass the Box's M Homogeneity of Covariance Test. Alternate statistical analysis may need to be considered. Often the violation may just need to be noted, but the Mixed ANOVA test can be run anyway. Additionally, the Greenhouse-Geisser correction will be later applied in this program, which can often help account slightly for this violation.")
}
else {
message("Box's M Test Assumption Satisfied")
}
}
check_covariance(data_new)
# Now, ANOVA test is run and checks for any effects. It also applies corrections (specified below)
run_anova <- function(data_new) {
## Applying Greenhouse-Geisser sphericity correction and Mauchly Test for Sphericity
res.aov <- anova_test(
data = data_new, dv = FoodIn.cum.kcal, wid = Animal,
between = Group, within = Time
)
message("Mauchly Test internally applied in res.aov")
message("get_anova_table: the Greenhouse-Geisser sphericity correction is automatically applied to factors violating the sphericity assumption.")
aov <- get_anova_table(res.aov)
print(aov)
if (aov[1,6] == '*') {
message("Group Main Effect Detected")
}
if (aov[2,6] == '*') {
message("Time Main Effect Detected")
}
if(aov[3,6] == '*') {
message("Interaction Between Group and Time Detected")
}
assign('aov', aov, envir=.GlobalEnv)
return(aov)
# returns an ANOVA dataframe with key details. This can be retrieved later as "aov".
}
run_anova(data_new)
# Post Hoc, seeing Group effects at every time point
treat_maineffect<- function(data_new) {
print("Now running post-hoc tests for signifcance by time point. Depending on data size, this may take slightly longer than expected")
one_way_test <- data_new %>%
group_by(Time) %>%
anova_test(dv = FoodIn.cum.kcal, wid = Animal, between = Group) %>%
get_anova_table() %>%
adjust_pvalue(method = "bonferroni") %>%
as_tibble()
View(one_way_test)
colnames(one_way_test)[7] <- "Significance"
# Find Significance from dataframe. Below line builds a dataframe based only on significance.
# Then a row count is done to check if significance is present
detected_sig <- subset(one_way_test, grepl("\\*", Significance))
print(one_way_test)
if (nrow(detected_sig) > 0) {
message("Significance in Group main effect was detected at certain time points")
message("Please see complete table above to record time points where signfificance was seen")
}
else {
message("No Significance was detected on the dependent variable between groups at any time point")
}
}
treat_maineffect(data_new)
## Pairwise Data frame Construction for Plotting. This function accomplishes the same thing as the post-hoc test above, but does so for the prupose of visualization. This table is also retreivable
make_pairwise <- function(data_new) {
Max_mass <- data_new %>%
group_by(Time) %>%
summarize(max(FoodIn.cum.kcal))
# The above code generates a data frame for maximum DV value to be added to the pairwise function.
# This is so that the mass function can be carried to significant dataframes for plotting of significance indicators later
pw <<- data_new %>%
group_by(Time) %>%
pairwise_t_test(FoodIn.cum.kcal ~ Group, p.adjust.method = "bonferroni") %>%
add_column(yplacement = Max_mass$'max(FoodIn.cum.kcal)')
# The above last line is for adding a coordinate for assisting in placement of significance labels in the pw dataframe
View(pw)
pw <- print(pw)
return(pw)
}
make_pairwise(data_new)
#NOTE: THERE IS POTENTIAL TO REPEAT THIS POST HOC TEST, BUT GROUPED BY BETWEEN VARIABLE AND MEASURING THE EFFECT OF TIME
## Use this pairwise dataframe to generate final boxplot
final_box <- function(pw) {
singlesig <- pw %>%
filter(p.signif == "*") %>%
select(Time, p, yplacement)
## Above and below code blocks are for setting the y-value placement of the asterisk (signficance indicators)
doublesig <- pw %>%
filter(p.signif == "**") %>%
select(Time, p, yplacement)
cat("\nNow making visualization. Please choose interval size for Time (x-axis):\n")
int_chosen <- as.integer(trimws(readline("Enter your choice as a numeric value based on how far apart you would like your time variable spaced: ")))
## Choosing interval size to avoid unreadable x-axis
data_new$Time <- as.numeric(as.character(data_new$Time))
## Conversion to numeric for processing in labeling adjustment
# Choosing F-statistic presentation
cat("\nPlease choose the precise F-statistic type you would like on your plot:\n")
cat("1. Group Main Effect\n")
cat("2. Time Main Effect\n")
cat("3. Group-Time Interaction Effect\n")
effect_type <- as.integer(trimws(readline("Enter your choice as 1, 2, or 3: ")))
if (effect_type == 1) {
f_chosen = 1
Effect_F <- aov$'F'[1]
Effect_F
}
if (effect_type == 2) {
f_chosen = 2
Effect_F <- aov$'F'[2]
Effect_F
}
if (effect_type == 3) {
f_chosen = 3
Effect_F <- aov$'F'[3]
Effect_F
}
fin_bxp <- ggboxplot(
data_new, x = "Time", y = "FoodIn.cum.kcal",
color = "Group", palette = "jco") +
geom_text(x = max(as.numeric(data_new$Time)), y = min(data_new$FoodIn.cum.kcal),
label = sprintf("F(%.2f) = %.3f", aov$DFn[f_chosen], Effect_F),
# The printed value on the plot represents the DF and F statistic value.
# For a breakdown on p-values by time point, please reference the "pw" DF
hjust = 1, vjust = 1, color = "red") +
geom_text(data = singlesig, aes(x=Time, y=yplacement, label = "*"), size = 5, hjust = 0.5, vjust = -0.8, color = "red") +
geom_text(data = doublesig, aes(x=Time, y=yplacement, label = "**"), size = 5, hjust = 0.5, vjust = -0.8, color = "red") +
labs(x = "Time (hours)", y = "Food Intake (kcal)") +
scale_x_discrete(breaks = seq(0, max(data_new$Time), by = int_chosen))+
theme_minimal()
fin_bxp <- fin_bxp + expand_limits(y = 0)
fin_bxp
}
final_box(pw)
}
# Below line is for running the "mega" function which encapsulates each section into one. See above instructions at the top of the script.
anova_general(data_new)