-
Notifications
You must be signed in to change notification settings - Fork 0
/
eda.R
30 lines (25 loc) · 905 Bytes
/
eda.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
library("lattice")
library("stringr")
library("rstudioapi")
# Make this script's directory the working directory so the relative paths
# used below ("utils.R" and, later, "data/...") resolve against it.
# The RStudio API can only report the active document from inside RStudio,
# and it reports an empty path for a document that has not been saved. In
# either case the current working directory is kept instead of failing.
current_path <- if (rstudioapi::isAvailable()) {
  getActiveDocumentContext()$path
} else {
  ""
}
if (isTRUE(nzchar(current_path))) {
  setwd(dirname(current_path))
} else {
  message(
    "Active document path unavailable; keeping working directory: ",
    getwd()
  )
}
source("utils.R")

# Name of the response column and of the id column.
response.var <- "status_group"
id.var <- "id"

# Load the training data. get.data.set() is not defined in this file;
# presumably it comes from utils.R -- confirm.
training.set <- get.data.set(df = "train", id.var)

# Row count and column names are captured once, before the EDA loop runs.
num.rows <- nrow(training.set)
col.names <- colnames(training.set)
# Tabulate the response column over the full training set:
#   response.freqs - count of rows per response value
#   response.props - the same counts expressed as a share of the total
response.freqs <- table(training.set[, response.var])
# The frequency table is one-dimensional, so its margin over dimension 1 is
# the table itself and the proportions can be taken from it directly.
response.props <- prop.table(response.freqs)
# Columns that are skipped by the per-column EDA below: three fixed columns
# plus the response column itself.
not.useful.cols <- c("id", "date_recorded", "num_private", response.var)

# Run the matching EDA helper on every remaining column.
for (col in col.names) {
  if (col %in% not.useful.cols) {
    next
  }
  # is.factor() always yields a single TRUE/FALSE. Comparing class() with ==
  # yields one value per class, so an ordered factor (classes "ordered" and
  # "factor") would give `if` a length-2 condition, which is an error in
  # R >= 4.2 and was silently treated as "not a factor" before that.
  if (is.factor(training.set[, col])) {
    # The factor helper's return value replaces training.set.
    training.set <- factor.eda.fxn(training.set, col, response.var, num.rows)
  } else {
    # Every non-factor column goes to the numeric helper; its return value
    # is not used here.
    numeric.eda.fxn(training.set, col, response.var)
  }
}
# Save training.set, as left by the EDA loop above, to a CSV file under the
# data/ directory (relative to the working directory).
write.csv(x = training.set, file = "data/full_training_set.csv")