From 22204f3bc9c9177763e76f517248be75d2d5a93b Mon Sep 17 00:00:00 2001
From: pan1fan2
Date: Sat, 13 Mar 2021 22:12:54 +0800
Subject: [PATCH] create eda r script

---
 R/eda.R                   | 40 +++++++++++++++++++++++--
 man/eda.Rd                |  8 ++++-
 tests/testthat/test-eda.R | 63 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 108 insertions(+), 3 deletions(-)
 create mode 100644 tests/testthat/test-eda.R

diff --git a/R/eda.R b/R/eda.R
index baed54e..bd3d6c0 100644
--- a/R/eda.R
+++ b/R/eda.R
@@ -7,12 +7,48 @@
 #' @export
 #'
 #' @examples
-#' result <- eda(mtcars)
+#' df <- data.frame(num1 = c(8.5, 8, 9.2, 9.1, 9.4),
+#'                  num2 = c(0.88, 0.93, 0.95 , 0.92 , 0.91),
+#'                  num3 = c(0.46, 0.78, 0.66, 0.69, 0.52),
+#'                  num4 = c(0.082, 0.078, 0.082, 0.085, 0.066),
+#'                  cat1 = c("Good","Okay","Excellent","Terrible","Good"),
+#'                  target = c(2,2,3,1,3))
+#' result <- eda(df,"target")
 #' nb_num_feat <- result$nb_num_features
 #' nb_cat_feat <- result$nb_cat_features
 #' ls_cat_feat <- result$cat_features
 #' ls_num_feat <- result$num_features
 #' nb_class<- result$nb_classes
 #' pair_plot <- result$pairplot
+
 eda <- function(df, target){
-}
\ No newline at end of file
+  # Validate the inputs before doing any work.
+  if (!is.data.frame(df)) {
+    stop("eda expects a data frame object")
+  }
+  if (!is.character(target) || length(target) != 1L ||
+      !(target %in% colnames(df))) {
+    stop("column name is incorrect")
+  }
+
+  # Separate the features from the target; drop = FALSE keeps a data frame
+  # even when a single feature column is left.
+  df_fea <- df[, names(df) != target, drop = FALSE]
+  num_fea <- dplyr::select_if(df_fea, is.numeric)
+  num_names <- colnames(num_fea)
+  cat_names <- setdiff(colnames(df_fea), num_names)
+
+  list(
+    num_features_name = num_names,
+    cat_features_name = cat_names,
+    nb_cat_features = length(cat_names),
+    nb_num_features = length(num_names),
+    # [[ ]] extracts a plain vector from data frames and tibbles alike.
+    nb_class = sort(unique(df[[target]])),
+    # One-row tibble with the count of missing values per feature column.
+    missing = tibble::as_tibble(
+      purrr::map(df_fea, function(col) sum(is.na(col)))
+    ),
+    pairplot = GGally::ggpairs(num_fea)
+  )
+}
diff --git a/man/eda.Rd b/man/eda.Rd
index fc61bd1..1765c94 100644
--- a/man/eda.Rd
+++ b/man/eda.Rd
@@ -18,7 +18,13 @@ eda(df, target)
 Run a quick EDA analysis
 }
 \examples{
-result <- eda(mtcars)
+df <- data.frame(num1 = c(8.5, 8, 9.2, 9.1, 9.4),
+                 num2 = c(0.88, 0.93, 0.95 , 0.92 , 0.91),
+                 num3 = c(0.46, 0.78, 0.66, 0.69, 0.52),
+                 num4 = c(0.082, 0.078, 0.082, 0.085, 0.066),
+                 cat1 = c("Good","Okay","Excellent","Terrible","Good"),
+                 target = c(2,2,3,1,3))
+result <- eda(df,"target")
 nb_num_feat <- result$nb_num_features
 nb_cat_feat <- result$nb_cat_features
 ls_cat_feat <- result$cat_features
diff --git a/tests/testthat/test-eda.R b/tests/testthat/test-eda.R
new file mode 100644
index 0000000..822bc37
--- /dev/null
+++ b/tests/testthat/test-eda.R
@@ -0,0 +1,63 @@
+df <- data.frame(num1 = c(8.5, 8, 9.2, 9.1, 9.4),
+                 num2 = c(0.88, 0.93, 0.95 , 0.92 , 0.91),
+                 num3 = c(0.46, 0.78, 0.66, 0.69, 0.52),
+                 num4 = c(0.082, 0.078, 0.082, 0.085, 0.066),
+                 cat1 = c("Good","Okay","Excellent","Terrible","Good"),
+                 target = c(2,2,3,1,3))
+ls <- c(1,2,3,4,5)
+
+test_that('Plot should use gg matrix and the numerical column only', {
+  p <- eda(df,"target")$pairplot
+  expect_true("ggmatrix" %in% c(class(p)))
+  expect_true(eda(df,"target")$nb_num_features == p$ncol)
+})
+
+test_that('Check attributes', {
+  res <- eda(df,"target")
+  expect_true("pairplot" %in% names(res))
+  expect_true("nb_num_features" %in% names(res))
+})
+
+test_that('Test the length of numerical features', {
+  res <- eda(df,"target")
+  expect_true(res$nb_num_features == 4)
+})
+
+test_that('Test the length of categorical features', {
+  res <- eda(df,"target")
+  expect_true(res$nb_cat_features == 1)
+})
+
+test_that('Check numerical features names', {
+  res <- eda(df,"target")
+  expect_true("num1" %in% res$num_features_name)
+})
+
+test_that('Check categorical features names', {
+  res <- eda(df,"target")
+  expect_true("cat1" %in% res$cat_features_name)
+})
+
+test_that('Check class labels', {
+  res <- eda(df,"target")
+  expect_equivalent(res$nb_class, c(1,2,3))
+})
+
+test_that('Missing values',{
+  res <- eda(df,"target")
+  expect_equivalent(res$missing,
+                    data.frame(num1 = c(0),
+                               num2 = c(0),
+                               num3 = c(0),
+                               num4 = c(0),
+                               cat1 = c(0)))
+})
+
+# handle exception
+test_that('Check if user provides a data frame', {
+  expect_error(eda(ls,"target"))
+})
+
+test_that('Check if user provides a correct column name', {
+  expect_error(eda(df,"targ"))
+})