-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path.Rhistory
142 lines (142 loc) · 4.8 KB
/
.Rhistory
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
library(zoo)
# Import the data to impute and fill its gaps with a local polynomial fit.
# NOTE(review): judging by the names `days` / `measurements`, rows are days
# and columns are measurements within a day -- confirm against the CSV.
path <- "./Output/impute.csv"
data <- read.csv(path, header = TRUE, sep = ",")
data <- subset(data, select = -X)  # drop the column named X
days <- nrow(data)
measurements <- ncol(data)
names(data) <- 1:measurements      # address columns by position

# Visualization (while engineering)
# a) local polynomial regression
x <- 1:measurements
# START <- 200
START <- 42
par(mfrow = c(3, 3))  # 3 x 3 grid: one panel for each of the 9 plotted rows
for (i in START:(START + 8)) {
  y <- unlist(data[i, ])
  # local.poly <- loess(y ~ x, span = 0.1)
  # `degree` is spelled out in full: the abbreviated `deg` is only accepted
  # through partial argument matching.
  local.poly <- loess(y ~ x, degree = 1, span = 0.05)
  x.test <- x[is.na(y)]  # positions where the row has no value
  y.pred <- predict(local.poly, newdata = x.test)
  yy <- y
  yy[is.na(y)] <- y.pred
  # Green: row with gaps replaced by loess predictions.
  # Overlay (default colour): the original row, so filled gaps stay green.
  plot(x, yy, col = "green", type = "l", lwd = 2)
  lines(x, y, lwd = 2)
  # Number of gaps for which loess returned a non-NA prediction. Printed
  # explicitly because values are never auto-printed inside a for loop.
  imputed_values <- sum(!is.na(y.pred))
  print(imputed_values)
}

# b) interpolation
par(mfrow = c(3, 3))
# Interaction fatigue <-> physiological data
# 1. Read data
path <- "./Output/"
data.mean <- read.csv(
  paste0(path, "combined_data_mean.csv"),
  header = TRUE, sep = ","
)
str(data.mean)
n <- nrow(data.mean)
# `p` and `variables` are derived from `dat`, so they are set only after
# `dat` has been built further down.

# Factor variables: convert each questionnaire column to a factor, show its
# raw levels, then relabel them.
# NOTE(review): the relabelling relies on the order of the raw levels
# (first level -> "vigilant", second -> "fatigued") -- confirm the coding.
data.mean$VAS <- as.factor(data.mean$VAS)
levels(data.mean$VAS)
levels(data.mean$VAS) <- c("vigilant", "fatigued")
data.mean$MF <- as.factor(data.mean$MF)
levels(data.mean$MF)
levels(data.mean$MF) <- c("vigilant", "fatigued")
data.mean$phF <- as.factor(data.mean$phF)
levels(data.mean$phF)
levels(data.mean$phF) <- c("vigilant", "fatigued")
data.mean$ReIP <- as.factor(data.mean$ReIP)
levels(data.mean$ReIP)
levels(data.mean$ReIP) <- c("worse", "same", "better")

# TODO: the mean of ActivityClass (a factor variable) is meaningless ->
# change it to the most common value.
unique(data.mean$ActivityClass)
data.mean$ActivityClass <- as.factor(data.mean$ActivityClass)
levels(data.mean$ActivityClass)
# NOTE(review): assumes exactly six raw levels whose sorted order matches
# the labels below -- confirm.
levels(data.mean$ActivityClass) <- c(
  "undefined", "resting", "other", "biking", "running", "walking"
)

# Remove metadata
dat <- subset(
  data.mean,
  select = c(-X, -subjectID, -date, -sport, -n_answers, -timezone)
)
# TODO: include ActivityClass
p <- 11
variables <- names(dat)[1:p]  # taken before ActivityClass is dropped
dat <- subset(dat, select = -ActivityClass)
str(dat)
# 2. Statistical Analysis
library(ggplot2)
library(GGally)
library(reshape)
library(plot.matrix)
# Long format of the analysis table.
melted <- melt(dat)

# Intra-subject variability: keep subjectID (as a factor) next to the
# analysed columns, reshape to long format and draw boxplots filled by
# subject, with one free-scaled facet per variable.
dat2 <- subset(
  data.mean,
  select = c(-X, -date, -sport, -n_answers, -timezone, -ActivityClass)
)
dat2$subjectID <- as.factor(dat2$subjectID)
melted2 <- melt(dat2)
variability.plot <- ggplot(data = melted2, mapping = aes(x = variable, y = value))
variability.plot <- variability.plot +
  geom_boxplot(mapping = aes(fill = subjectID)) +
  facet_wrap(~variable, scales = "free")
print(variability.plot)
# Shapiro-Wilk test for every subject x variable combination.
# Produces two matrices (rows: subjects, columns: variables):
#   sample.size - number of non-missing observations
#   p.values    - Shapiro-Wilk p-value, or NA when the test cannot be run
n.subjects <- 27  # NOTE(review): assumes subject IDs are 1..27 -- confirm
min.obs <- 3      # shapiro.test() needs at least 3 non-missing values
p.values <- matrix(nrow = n.subjects, ncol = p)
sample.size <- matrix(nrow = n.subjects, ncol = p)
for (subj in seq_len(n.subjects)) {
  is.subj <- data.mean$subjectID == subj
  for (var in seq_len(p)) {
    # The analysed variables are columns 4 .. (3 + p) of data.mean.
    variable <- names(data.mean)[var + 3]
    data.subj <- data.mean[, variable][is.subj]
    observed <- data.subj[!is.na(data.subj)]
    sample.size[subj, var] <- length(observed)
    # shapiro.test() stops with an error for too few values and for a
    # (numerically) constant sample; record NA for those cells instead of
    # aborting the whole loop.
    # NOTE(review): the 1e-10 tolerance is meant to mirror the check inside
    # stats::shapiro.test -- confirm for your R version.
    testable <- length(observed) >= min.obs &&
      diff(range(observed)) >= 1e-10
    p.values[subj, var] <- if (testable) {
      shapiro.test(observed)$p.value
    } else {
      NA
    }
  }
}
sample.size
p.values

alpha <- 0.05
# H0: data is normally distributed -> if p < alpha: reject H0
result <- p.values < alpha
colnames(result) <- names(data.mean)[4:(3 + p)]
plot(result, col = c("green", "red"), las = 2, xlab = "Variable", ylab = "Subject",
     main = "Shapiro-Wilk test (red: reject H0 (normality assumption) under a = 0.05)")
# Questionnaires per subject
# Rows per subject ID 1..27, counted without an explicit index loop.
y <- vapply(1:27, function(i) sum(dat2$subjectID == i), numeric(1))
x <- 1:27

# table() treats subjectID as categorical data with multiple draws.
days.per.subject <- table(data.mean$subjectID)
days.per.subject
barplot(days.per.subject,
        main = "Days with filled out questionnaires",
        xlab = "subject",
        ylab = "days")

# Largest contributors: the last (up to) five entries of the ascending sort.
# With 27 subjects this is exactly positions 23:27.
# NOTE(review): the earlier comments said "top 4 contributors" and "4/27 give
# data for 65%", but the range 23:27 selects five subjects -- confirm which
# number was intended.
ranked <- sort(days.per.subject)
n.top <- min(5, length(ranked))
top.contributors <- ranked[seq(to = length(ranked), length.out = n.top)]
top.contributors
(sum(top.contributors) / n) * 100  # share of all rows, in percent
# Plot and summarise the days with filled out questionnaires per subject,
# split by one fatigue label.
#
# data  - data frame with a `subjectID` column and the label column
# label - name of the label column (character), also used in the plot title
#
# Draws a stacked barplot (one bar per subject) with a legend, prints the
# number of days per label level and their share of all days, and invisibly
# returns the label x subject contingency table.
plot_fatigue_days <- function(data, label) {
  counts <- table(data[[label]], data$subjectID)
  fill.colors <- c("blue", "red")
  barplot(counts,
          main = paste0("Days with filled out questionnaires (fatigue: ",
                        label, ")"),
          xlab = "subject",
          ylab = "days",
          col = fill.colors)
  legend("topleft", legend = rownames(counts), fill = fill.colors)
  level.totals <- rowSums(counts)
  print(level.totals)                # days per label level (counts)
  print(level.totals / sum(counts))  # proportion of days per label level
  invisible(counts)
}

# VAS, phF, MF
for (label in c("VAS", "phF", "MF")) {
  plot_fatigue_days(data.mean, label)
}