Skip to content

Commit

Permalink
Merge pull request #16 from UBC-MDS/eda
Browse files Browse the repository at this point in the history
create eda r script
  • Loading branch information
BruhatMusunuru authored Mar 13, 2021
2 parents 896654a + 22204f3 commit b5004b0
Show file tree
Hide file tree
Showing 3 changed files with 102 additions and 3 deletions.
34 changes: 32 additions & 2 deletions R/eda.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,42 @@
#' @export
#'
#' @examples
#' result <- eda(mtcars)
#' df <- data.frame(num1 = c(8.5, 8, 9.2, 9.1, 9.4),
#' num2 = c(0.88, 0.93, 0.95 , 0.92 , 0.91),
#' num3 = c(0.46, 0.78, 0.66, 0.69, 0.52),
#' num4 = c(0.082, 0.078, 0.082, 0.085, 0.066),
#' cat1 = c("Good","Okay","Excellent","Terrible","Good"),
#' target = c(2,2,3,1,3))
#' result <- eda(df,"target")
#' nb_num_feat <- result$nb_num_features
#' nb_cat_feat <- result$nb_cat_features
#' ls_cat_feat <- result$cat_features_name
#' ls_num_feat <- result$num_features_name
#' nb_class <- result$nb_class
#' pair_plot <- result$pairplot

eda <- function(df, target) {
  # Summarise a data frame for exploratory data analysis.
  #
  # Arguments:
  #   df     - a data frame (or tibble) holding the features and the target.
  #   target - a single string naming the target column of `df`.
  #
  # Returns a named list with, in this order:
  #   num_features_name - names of the numeric feature columns
  #   cat_features_name - names of the remaining (non-numeric) feature columns
  #   nb_cat_features   - number of non-numeric feature columns
  #   nb_num_features   - number of numeric feature columns
  #   nb_class          - sorted unique values of the target column
  #   missing           - one-row tibble with the NA count of every feature
  #   pairplot          - GGally::ggpairs() plot of the numeric features
  #
  # Signals an error when `df` is not a data frame, when `target` is not a
  # single string, or when `target` is not a column of `df`.
  if (!is.data.frame(df)) {
    stop("eda expects a data frame object", call. = FALSE)
  }
  if (!is.character(target) || length(target) != 1L || is.na(target)) {
    stop(
      "target must be a single column name given as a string",
      call. = FALSE
    )
  }
  if (!(target %in% colnames(df))) {
    stop("column name is incorrect", call. = FALSE)
  }

  # List-style `[` always returns a data frame, even when a single feature
  # column is left (matrix-style `df[, cols]` would collapse to a vector).
  df_fea <- df[!(names(df) %in% target)]

  is_num <- vapply(df_fea, is.numeric, logical(1))
  num_names <- names(df_fea)[is_num]
  cat_names <- names(df_fea)[!is_num]

  # One NA count per feature column, stored as a one-row tibble.
  na_counts <- lapply(df_fea, function(column) sum(is.na(column)))

  # `[[` extracts the target as a vector for data frames and tibbles alike.
  class_labels <- sort(unique(df[[target]]))

  list(
    num_features_name = num_names,
    cat_features_name = cat_names,
    nb_cat_features = length(cat_names),
    nb_num_features = length(num_names),
    nb_class = class_labels,
    missing = tibble::as_tibble(na_counts),
    pairplot = GGally::ggpairs(df_fea[is_num])
  )
}
8 changes: 7 additions & 1 deletion man/eda.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

63 changes: 63 additions & 0 deletions tests/testthat/test-eda.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# Fixture shared by every test: four numeric features, one categorical
# feature and a numeric target with three distinct classes. No value is NA.
df <- data.frame(
  num1 = c(8.5, 8, 9.2, 9.1, 9.4),
  num2 = c(0.88, 0.93, 0.95, 0.92, 0.91),
  num3 = c(0.46, 0.78, 0.66, 0.69, 0.52),
  num4 = c(0.082, 0.078, 0.082, 0.085, 0.066),
  cat1 = c("Good", "Okay", "Excellent", "Terrible", "Good"),
  target = c(2, 2, 3, 1, 3)
)

# Deliberately not a data frame; named so that it does not mask base::ls().
not_a_df <- c(1, 2, 3, 4, 5)

test_that("Plot should use gg matrix and the numerical column only", {
  res <- eda(df, "target")
  expect_true(inherits(res$pairplot, "ggmatrix"))
  expect_equal(res$pairplot$ncol, res$nb_num_features)
})

test_that("Check attributes", {
  res <- eda(df, "target")
  expect_named(
    res,
    c(
      "num_features_name", "cat_features_name", "nb_cat_features",
      "nb_num_features", "nb_class", "missing", "pairplot"
    ),
    ignore.order = TRUE
  )
})

test_that("Test the length of numerical features", {
  res <- eda(df, "target")
  expect_equal(res$nb_num_features, 4)
})

test_that("Test the length of categorical features", {
  res <- eda(df, "target")
  expect_equal(res$nb_cat_features, 1)
})

test_that("Check numerical features names", {
  res <- eda(df, "target")
  expect_equal(res$num_features_name, c("num1", "num2", "num3", "num4"))
})

test_that("Check categorical features names", {
  res <- eda(df, "target")
  expect_equal(res$cat_features_name, "cat1")
})

test_that("Check class labels", {
  res <- eda(df, "target")
  expect_equal(res$nb_class, c(1, 2, 3))
})

test_that("Missing values", {
  res <- eda(df, "target")
  # Compare the flattened counts so the check does not depend on whether the
  # summary is a tibble or a plain data frame.
  expect_equal(nrow(res$missing), 1)
  expect_equal(
    unlist(res$missing),
    c(num1 = 0, num2 = 0, num3 = 0, num4 = 0, cat1 = 0)
  )
})

# handle exception
# A message pattern is supplied so that an unrelated failure (for example a
# missing function) cannot satisfy the expectation.
test_that("Check if user provides a data frame", {
  expect_error(eda(not_a_df, "target"), "data frame")
})

test_that("Check if user provides a correct column name", {
  expect_error(eda(df, "targ"), "column name is incorrect")
})

0 comments on commit b5004b0

Please sign in to comment.