diff --git a/DESCRIPTION b/DESCRIPTION
index db4e6ef..c489710 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -17,3 +17,10 @@ Encoding: UTF-8
 LazyData: true
 Roxygen: list(markdown = TRUE)
 RoxygenNote: 7.1.1
+Imports:
+    dplyr,
+    tidyr,
+    magrittr
+Suggests:
+    testthat (>= 2.0.0)
+Config/testthat/edition: 2
diff --git a/NAMESPACE b/NAMESPACE
index 6ae9268..33a0a0a 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -1,2 +1,5 @@
 # Generated by roxygen2: do not edit by hand
 
+export(eda)
+export(imputation)
+export(scaler)
diff --git a/R/imputation.R b/R/imputation.R
index 3281080..ead0689 100644
--- a/R/imputation.R
+++ b/R/imputation.R
@@ -1,10 +1,11 @@
 #' Imputing Missing Data
 #'
-#' This function will impute missing data in a tibble/dataframe given the chosen method(mean, median, most_frequent)
+#' This function will impute missing data in a tibble/dataframe given the chosen method (mean, median, constant)
 #'
-#' @param fit_data list
-#' @param fill_data list
-#' @param method character
+#' @param fit_data data.frame used to compute the column means or medians
+#' @param fill_data data.frame whose missing values are filled
+#' @param method character, one of 'mean', 'median' or 'constant'
+#' @param constant replacement value used when method is 'constant', default NULL
 #'
-#' @return list
+#' @return data.frame, fill_data with its missing values replaced
 #' @export
@@ -12,5 +13,50 @@
 #' @examples
 #' test_df <- data.frame('a' = c(1,NA,3), 'b' = c(5,6,NA), 'c' = c(NA,1,10))
 #' test_df_imputed <- imputation(test_df, test_df, 'mean')
-imputation <- function(fit_data, fill_data, method){
+imputation <- function(fit_data, fill_data, method, constant = NULL) {
+  # Both inputs must be real data frames: a bare list passes a typeof()
+  # check but then fails on the column-count comparison with an obscure error.
+  if (!is.data.frame(fit_data)) {
+    stop("Fit Data should be a dataframe object")
+  }
+  if (!is.data.frame(fill_data)) {
+    stop("Filled Data should be a dataframe object")
+  }
+  # method must be exactly one of the supported strings
+  if (!is.character(method) || length(method) != 1L ||
+      !method %in% c("mean", "median", "constant")) {
+    stop("Method must be one of mean, median, constant")
+  }
+  if (ncol(fit_data) != ncol(fill_data)) {
+    stop("fit_data and fill_data must have equal number of columns")
+  }
+
+  if (method == "constant") {
+    # Fail early instead of surfacing "replacement has length zero" from [<-
+    if (is.null(constant)) {
+      stop("constant must be supplied when method is constant")
+    }
+    filled_data <- fill_data
+    filled_data[is.na(filled_data)] <- constant
+    return(filled_data)
+  }
+
+  # mean/median can only be computed from numeric columns
+  if (!all(vapply(fit_data, is.numeric, logical(1)))) {
+    stop("fit_data must contain all numeric values")
+  }
+  column_stat <- if (method == "mean") mean else stats::median
+  fill_values <- vapply(fit_data, column_stat, numeric(1), na.rm = TRUE)
+
+  # Fill column by column, matched on name. Plain assignment promotes an
+  # integer column to double when the statistic is fractional, a type change
+  # that recent tidyr::replace_na() releases refuse to perform.
+  filled_data <- fill_data
+  for (col in intersect(names(fill_values), names(filled_data))) {
+    missing_rows <- is.na(filled_data[[col]])
+    if (any(missing_rows)) {
+      filled_data[[col]][missing_rows] <- fill_values[[col]]
+    }
+  }
+  filled_data
 }
diff --git a/man/eda.Rd b/man/eda.Rd
new file mode 100644
index 0000000..fc61bd1
--- /dev/null
+++ b/man/eda.Rd
@@ -0,0 +1,28 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/eda.R
+\name{eda}
+\alias{eda}
+\title{Run a quick EDA analysis}
+\usage{
+eda(df, target)
+}
+\arguments{
+\item{df}{dataframe}
+
+\item{target}{character}
+}
+\value{
+(list) List that contains statistical summaries and a pairplot.
+}
+\description{
+Run a quick EDA analysis
+}
+\examples{
+result <- eda(mtcars)
+nb_num_feat <- result$nb_num_features
+nb_cat_feat <- result$nb_cat_features
+ls_cat_feat <- result$cat_features
+ls_num_feat <- result$num_features
+nb_class<- result$nb_classes
+pair_plot <- result$pairplot
+}
diff --git a/man/imputation.Rd b/man/imputation.Rd
new file mode 100644
index 0000000..29ceb45
--- /dev/null
+++ b/man/imputation.Rd
@@ -0,0 +1,27 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/imputation.R
+\name{imputation}
+\alias{imputation}
+\title{Imputing Missing Data}
+\usage{
+imputation(fit_data, fill_data, method, constant = NULL)
+}
+\arguments{
+\item{fit_data}{data.frame used to compute the column means or medians}
+
+\item{fill_data}{data.frame whose missing values are filled}
+
+\item{method}{character, one of 'mean', 'median' or 'constant'}
+
+\item{constant}{replacement value used when method is 'constant', default NULL}
+}
+\value{
+data.frame, fill_data with its missing values replaced
+}
+\description{
+This function will impute missing data in a tibble/dataframe given the chosen method (mean, median, constant)
+}
+\examples{
+test_df <- data.frame('a' = c(1,NA,3), 'b' = c(5,6,NA), 'c' = c(NA,1,10))
+test_df_imputed <- imputation(test_df, test_df, 'mean')
+}
diff --git a/man/scaler.Rd b/man/scaler.Rd
new file mode 100644
index 0000000..60e5b69
--- /dev/null
+++ b/man/scaler.Rd
@@ -0,0 +1,31 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/scaler.R
+\name{scaler}
+\alias{scaler}
+\title{Scaling a dataframe}
+\usage{
+scaler(X_train, X_Valid, X_test, scale_features, scaler_type)
+}
+\arguments{
+\item{X_train}{data.frame}
+
+\item{X_Valid}{data.frame}
+
+\item{X_test}{data.frame}
+
+\item{scale_features}{character vector}
+
+\item{scaler_type}{character}
+}
+\value{
+data.frame of data.frames
+}
+\description{
+This function scales numerical features based on scaling requirement in a data.frame
+}
+\examples{
+X_train<- data.frame('name' = c('pandaman', 'doorman', 'eve'), 'age' = c(15,20,25), 'networth' = c(100000,100,100000))
+X_Valid<- data.frame('name' = c('pandaman_v', 'doorman_v', 'eve_v'), 'age' = 
c(15,56, 43) , 'networth' = c(123124, 352334,645645)
+X_test <- data.frame('name' = c('pandaman_t', 'doorman_t', 'eve_t'), 'age' =c(14,15,56), 'networth' = c(123124,90914, 124124)
+scaled_df <- scaler(X_train, X_Valid, X_test, c('age','networth'), scaler_type='standardscaler')
+}
diff --git a/tests/testthat.R b/tests/testthat.R
new file mode 100644
index 0000000..82afbe4
--- /dev/null
+++ b/tests/testthat.R
@@ -0,0 +1,4 @@
+library(testthat)
+library(prepropyr)
+
+test_check("prepropyr")
diff --git a/tests/testthat/test-imputation.R b/tests/testthat/test-imputation.R
new file mode 100644
index 0000000..df8517b
--- /dev/null
+++ b/tests/testthat/test-imputation.R
@@ -0,0 +1,46 @@
+df_1 <- data.frame('a' = c(NA, 4, 10), 'b' = c(2, NA, 5), 'c' = c(3,6,9))
+df_1_t <- data.frame('a' = c(7, 4, 10), 'b' = c(2, 3.5, 5), 'c' = c(3,6,9))
+df_2 <- data.frame('a' = c(NA, 4, 10, 3), 'b' = c(2, NA, 5, 15), 'c' = c(3, 6, 9, 17))
+df_2_t <- data.frame('a' = c(4.0, 4, 10, 3), 'b' = c(2, 5.0, 5, 15), 'c' = c(3, 6, 9, 17))
+df_3 <- data.frame('a' = c(NA, 'd', 'd'), 'b' = c('b', NA, 'x'), 'c' = c('c','f',NA))
+df_3_t <- data.frame('a' = c('test', 'd', 'd'), 'b' = c('b','test', 'x'), 'c' = c('c','f','test'))
+df_m <- data.frame('a' = c(NA, 4, 10), 'b' = c('a', NA, 'b'), 'c' = c(3,6,9))
+df_m_t <- data.frame('a' = c('test', 4, 10), 'b' = c('a', 'test', 'b'), 'c' = c(3,6,9))
+
+test_that("imputation returns output as expected given different methods", {
+  # test mean
+  expect_equal(imputation(df_1, df_1, 'mean'), df_1_t)
+  # test median
+  expect_equal(imputation(df_2, df_2, 'median'), df_2_t)
+  # test for constant
+  expect_equal(imputation(df_3, df_3, 'constant', 'test'), df_3_t)
+  # test for mean imputation when there are no NA values
+  expect_equal(imputation(df_1_t, df_1_t, 'mean'), df_1_t)
+  # test for constant imputation when dataframe has mixed data type columns
+  expect_equal(imputation(df_m, df_m, 'constant', 'test'), df_m_t)
+  # test for dataframe when there is only a single value
+  expect_equal(imputation(data.frame('a' = c(1)), data.frame('a' = c(1)), 'mean'), data.frame('a' = c(1)))
+})
+
+test_that('tests whether exceptions will raise as expected', {
+  # Each pattern pins the guard expected to fire, so an unrelated failure
+  # inside imputation() cannot satisfy the expectation.
+  # test for entering non data.frame() into the function as fit_data
+  expect_error(imputation(1, df_1, 'mean'),
+               'Fit Data should be a dataframe object', fixed = TRUE)
+  # test for entering non data.frame() into the function as fill_data
+  expect_error(imputation(df_1, 1, 'mean'),
+               'Filled Data should be a dataframe object', fixed = TRUE)
+  # test for using non existing method
+  expect_error(imputation(df_1, df_1, 'test'),
+               'Method must be one of mean, median, constant', fixed = TRUE)
+  # test for imputation when number of columns of fit_data doesn't equal to fill_data
+  expect_error(imputation(df_1, data.frame('a' = c(1)), 'mean'),
+               'fit_data and fill_data must have equal number of columns',
+               fixed = TRUE)
+  # test for whether fit data is all numeric when mean/median is selected as method
+  expect_error(imputation(df_3, df_3, 'mean'),
+               'fit_data must contain all numeric values', fixed = TRUE)
+  # test for constant imputation when no replacement value is supplied
+  expect_error(imputation(df_3, df_3, 'constant'))
+})