-
Notifications
You must be signed in to change notification settings - Fork 0
/
logistic_regression.R
93 lines (75 loc) · 3.05 KB
/
logistic_regression.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# Load the data set and show its row count and its row/column names.
df <- read.csv('/path/to/db')
nrow(df)
dimnames(df)
# Logistic Regression
# Exclude the M-stage
# exclude <- list("M")
# df <- df[,!(names(df) %in% exclude)]
#df_na <- na.omit(df)
# Remove the only row with an N3 stage.
# `%in%` never returns NA, so rows whose N stage is missing are kept unchanged.
# A plain `!=` comparison returns NA for those rows, and an NA row index makes
# `[` emit a row consisting entirely of NA values instead of the original row.
df <- df[!(df[, "N"] %in% "N3"), , drop = FALSE]
# ---- Prediction target and split settings ----
# Column holding the value the model predicts
pred_col <- "Outcome"
# Share of the rows lacking an outcome that is added to the training data
fraction_without_pred <- 0.1
# Share of the rows with an outcome that goes to training (rest: validation)
split <- 0.7
# ---- Partition the rows by outcome availability ----
has_outcome <- !is.na(df[pred_col])
df_pred <- df[has_outcome, ]
df_no_pred <- df[!has_outcome, ]
# Randomly pick the training rows among those with an outcome. The indices are
# kept sorted; rows of df_pred not listed here form the validation data.
size <- nrow(df_pred)
split_pred <- sort(sample(size, size * split))
# Randomly pick the configured share of the rows without an outcome
n_no_pred <- nrow(df_no_pred)
no_pred_rows <- sample(n_no_pred, round(n_no_pred * fraction_without_pred))
# Training data (before factor conversion): both selections stacked row-wise
df_train_c <- rbind(df_pred[split_pred, ], df_no_pred[no_pred_rows, ])
# Data frames produced below (every column is converted to a factor):
#   df_train_c      - training rows with an outcome plus the sampled rows
#                     without one (if fraction_without_pred > 0)
#   df_train_c_pred - training rows with an outcome only, no imputation
#   df_train        - df_train_c after imputation
#   df_valid_c      - validation rows (all have an outcome), no imputation
#   df_valid        - validation rows after imputation
train_rows <- df_pred[split_pred, ]
valid_rows <- df_pred[-split_pred, ]
df_train_c <- data.frame(lapply(df_train_c, as.factor))
df_train_c_pred <- data.frame(lapply(train_rows, as.factor))
df_valid_c <- data.frame(lapply(valid_rows, as.factor))
# Impute missing values with mice (5 imputations per data set). The "stacked"
# action presumably returns the completed data sets appended row-wise, which
# would make the result 5 times as long as its input - see ?mice::complete.
imp_train <- mice::mice(df_train_c, m = 5)
df_train <- mice::complete(imp_train, "stacked")
imp_valid <- mice::mice(df_valid_c, m = 5)
df_valid <- mice::complete(imp_valid, "stacked")
# Give the evaluation data frames the factor levels of the training data so
# that predict() sees the same factor coding the model was fitted with.
# Each column is rebuilt by label with factor(). Assigning to levels() instead
# relabels the existing levels by position, which silently turns one category
# into another whenever the level sets differ and fails when the evaluation
# data has more levels than the training data. Values that never occur in the
# training data become NA.
for (col_name in names(df_train)) {
  train_levels <- levels(df_train[[col_name]])
  if (is.null(train_levels)) {
    # Not a factor in the training data: nothing to align.
    next
  }
  df_valid[[col_name]] <- factor(
    as.character(df_valid[[col_name]]),
    levels = train_levels
  )
  df_valid_c[[col_name]] <- factor(
    as.character(df_valid_c[[col_name]]),
    levels = train_levels
  )
  df_train_c_pred[[col_name]] <- factor(
    as.character(df_train_c_pred[[col_name]]),
    levels = train_levels
  )
}
#df_f <- data.frame(lapply(df , as.factor))
#df_f <- mice::complete(mice::mice(df_f, m=3), "stacked")
#split <- 0.7
#size <- nrow(df_f)
#split_pred <- sort(sample(size, size * split))
#df_train <- df_f[split_pred,]
#df_valid <- df_f[-split_pred,]
# Fit a logistic regression (binomial family, logit link) of Outcome on all
# other columns of the imputed training data, then show the fit summary.
model <- glm(
  formula = Outcome ~ .,
  family = binomial(link = "logit"),
  data = df_train
)
summary(model)
#result <- predict(model, newdata = df_train, type="response")
#b_result <- ifelse(result > 0.5, 1, 0)
#mis_error <- mean(b_result != 0)
### Results
# Training (the imputed data the model was fitted on)
# Predicted probabilities, thresholded at 0.5 into 0/1 labels.
result <- predict(model, newdata = df_train, type = "response")
b_result <- ifelse(result > 0.5, 1, 0)
# Share of rows whose predicted label differs from the recorded outcome.
# NOTE(review): assumes Outcome is coded 0/1 - confirm against the data.
mis_error <- mean(b_result != df_train$Outcome)
# roc()/auc() are namespace-qualified because this script never attaches the
# package that provides them (pROC, judging by the roc(response, predictor)
# argument order).
roc_object <- pROC::roc(df_train$Outcome, result)
pROC::auc(roc_object)
# No imputation performed - only rows with Outcome
# Predicted probabilities, thresholded at 0.5 into 0/1 labels.
result <- predict(model, newdata = df_train_c_pred, type = "response")
b_result <- ifelse(result > 0.5, 1, 0)
# These rows are not imputed, so a row with a missing predictor gets an NA
# prediction; na.rm = TRUE keeps the error rate from collapsing to NA.
# NOTE(review): assumes Outcome is coded 0/1 - confirm against the data.
mis_error <- mean(b_result != df_train_c_pred$Outcome, na.rm = TRUE)
# roc()/auc() are namespace-qualified because this script never attaches the
# package that provides them (pROC, judging by the roc(response, predictor)
# argument order).
roc_object <- pROC::roc(df_train_c_pred$Outcome, result)
pROC::auc(roc_object)
# Validation (imputed held-out rows)
# Predicted probabilities, thresholded at 0.5 into 0/1 labels.
result <- predict(model, newdata = df_valid, type = "response")
b_result <- ifelse(result > 0.5, 1, 0)
# Share of rows whose predicted label differs from the recorded outcome.
# NOTE(review): assumes Outcome is coded 0/1 - confirm against the data.
mis_error <- mean(b_result != df_valid$Outcome)
# roc()/auc() are namespace-qualified because this script never attaches the
# package that provides them (pROC, judging by the roc(response, predictor)
# argument order).
roc_object <- pROC::roc(df_valid$Outcome, result)
pROC::auc(roc_object)
# No imputation performed (held-out rows as they are)
# Predicted probabilities, thresholded at 0.5 into 0/1 labels.
result <- predict(model, newdata = df_valid_c, type = "response")
b_result <- ifelse(result > 0.5, 1, 0)
# The predictions come from df_valid_c, so they are compared with the outcomes
# of df_valid_c. df_valid is the imputed data set and does not have the same
# number of rows, so its Outcome column does not line up with `result`.
# Rows with a missing predictor get an NA prediction; na.rm = TRUE keeps the
# error rate from collapsing to NA.
# NOTE(review): assumes Outcome is coded 0/1 - confirm against the data.
mis_error <- mean(b_result != df_valid_c$Outcome, na.rm = TRUE)
# roc()/auc() are namespace-qualified because this script never attaches the
# package that provides them (pROC, judging by the roc(response, predictor)
# argument order).
roc_object <- pROC::roc(df_valid_c$Outcome, result)
pROC::auc(roc_object)