From 14767fded188369f4c4f876c2f79e5ec90a8449a Mon Sep 17 00:00:00 2001
From: Jason Chang <ja.chang0628@gmail.com>
Date: Fri, 12 Mar 2021 23:12:31 +0800
Subject: [PATCH 1/2] imputation function completed

---
 DESCRIPTION    |  7 +++++++
 NAMESPACE      |  3 +++
 R/imputation.R | 47 +++++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 55 insertions(+), 2 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index db4e6ef..c489710 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -17,3 +17,10 @@ Encoding: UTF-8
 LazyData: true
 Roxygen: list(markdown = TRUE)
 RoxygenNote: 7.1.1
+Imports: 
+    dplyr,
+    tidyr,
+    magrittr
+Suggests: 
+    testthat (>= 2.0.0)
+Config/testthat/edition: 2
diff --git a/NAMESPACE b/NAMESPACE
index 6ae9268..33a0a0a 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -1,2 +1,5 @@
 # Generated by roxygen2: do not edit by hand
 
+export(eda)
+export(imputation)
+export(scaler)
diff --git a/R/imputation.R b/R/imputation.R
index 3281080..ead0689 100644
--- a/R/imputation.R
+++ b/R/imputation.R
@@ -1,10 +1,11 @@
 #' Imputing Missing Data
 #'
-#' This function will impute missing data in a tibble/dataframe given the chosen method(mean, median, most_frequent)
+#' This function will impute missing data in a tibble/dataframe given the chosen method (mean, median, constant)
 #'
 #' @param fit_data list
 #' @param fill_data list
 #' @param method character
+#' @param constant value used to replace missing entries when method is 'constant', default NULL
 #'
 #' @return list
 #' @export
@@ -12,5 +13,47 @@
 #' @examples
 #' test_df <- data.frame('a' = c(1,NA,3), 'b' = c(5,6,NA), 'c' = c(NA,1,10))
 #' test_df_imputed <- imputation(test_df, test_df, 'mean')
-imputation <- function(fit_data, fill_data, method){
+imputation <- function(fit_data, fill_data, method, constant = NULL){
+  # Fill NA entries of fill_data with per-column means/medians of fit_data
+  # or with a fixed value ('constant'); returns the filled copy.
+  # Both inputs must be data frames: a bare list passes a typeof() check
+  # but has no dim(), so it used to fail later with an unrelated error.
+  if (!is.data.frame(fit_data)) {
+    stop("Fit Data should be a dataframe object")
+  }
+  if (!is.data.frame(fill_data)) {
+    stop('Filled Data should be a dataframe object')
+  }
+  # method must be a single value naming one of the supported strategies
+  if (length(method) != 1 || !method %in% c('mean', 'median', 'constant')) {
+    stop('Method must be one of mean, median, constant')
+  }
+  # fit_data and fill_data are required to have the same number of columns
+  if (ncol(fit_data) != ncol(fill_data)) {
+    stop('fit_data and fill_data must have equal number of columns')
+  }
+  if (method == 'constant') {
+    missing_cells <- is.na(fill_data)
+    # the default constant = NULL cannot be assigned into a column; fail
+    # with a clear message instead of "replacement has length zero"
+    if (is.null(constant) && any(missing_cells)) {
+      stop("constant must be supplied when method is 'constant'")
+    }
+    filled_data <- fill_data
+    filled_data[missing_cells] <- constant
+    return(filled_data)
+  }
+  # mean / median need numeric columns; vapply always yields a logical
+  # vector, even for a data frame with zero columns
+  if (!all(vapply(fit_data, is.numeric, logical(1)))) {
+    stop('fit_data must contain all numeric values')
+  }
+  # named list with one fill value per fit_data column, matched to the
+  # columns of fill_data by name inside tidyr::replace_na
+  if (method == 'mean') {
+    fill_values <- lapply(fit_data, mean, na.rm = TRUE)
+  } else {
+    fill_values <- lapply(fit_data, median, na.rm = TRUE)
+  }
+  tidyr::replace_na(fill_data, fill_values)
 }

From 76f221e55552061eb9ee89a2be728aed42d1177f Mon Sep 17 00:00:00 2001
From: Jason Chang <ja.chang0628@gmail.com>
Date: Fri, 12 Mar 2021 23:51:19 +0800
Subject: [PATCH 2/2] tests for imputation completed

---
 man/eda.Rd                       | 28 +++++++++++++++++++++++++
 man/imputation.Rd                | 27 ++++++++++++++++++++++++
 man/scaler.Rd                    | 31 +++++++++++++++++++++++++++
 tests/testthat.R                 |  4 ++++
 tests/testthat/test-imputation.R | 36 ++++++++++++++++++++++++++++++++
 5 files changed, 126 insertions(+)
 create mode 100644 man/eda.Rd
 create mode 100644 man/imputation.Rd
 create mode 100644 man/scaler.Rd
 create mode 100644 tests/testthat.R
 create mode 100644 tests/testthat/test-imputation.R

diff --git a/man/eda.Rd b/man/eda.Rd
new file mode 100644
index 0000000..fc61bd1
--- /dev/null
+++ b/man/eda.Rd
@@ -0,0 +1,28 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/eda.R
+\name{eda}
+\alias{eda}
+\title{Run a quick EDA analysis}
+\usage{
+eda(df, target)
+}
+\arguments{
+\item{df}{dataframe}
+
+\item{target}{character}
+}
+\value{
+(list) List that contains statistical summaries and a pairplot.
+}
+\description{
+Run a quick EDA analysis
+}
+\examples{
+result <- eda(mtcars)
+nb_num_feat  <- result$nb_num_features
+nb_cat_feat <- result$nb_cat_features
+ls_cat_feat <- result$cat_features
+ls_num_feat <- result$num_features
+nb_class<- result$nb_classes
+pair_plot <- result$pairplot
+}
diff --git a/man/imputation.Rd b/man/imputation.Rd
new file mode 100644
index 0000000..29ceb45
--- /dev/null
+++ b/man/imputation.Rd
@@ -0,0 +1,27 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/imputation.R
+\name{imputation}
+\alias{imputation}
+\title{Imputing Missing Data}
+\usage{
+imputation(fit_data, fill_data, method, constant = NULL)
+}
+\arguments{
+\item{fit_data}{list}
+
+\item{fill_data}{list}
+
+\item{method}{character}
+
+\item{constant}{value used to replace missing entries when method is 'constant', default NULL}
+}
+\value{
+list
+}
+\description{
+This function will impute missing data in a tibble/dataframe given the chosen method (mean, median, constant)
+}
+\examples{
+test_df <- data.frame('a' = c(1,NA,3), 'b' = c(5,6,NA), 'c' = c(NA,1,10))
+test_df_imputed <- imputation(test_df, test_df, 'mean')
+}
diff --git a/man/scaler.Rd b/man/scaler.Rd
new file mode 100644
index 0000000..60e5b69
--- /dev/null
+++ b/man/scaler.Rd
@@ -0,0 +1,31 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/scaler.R
+\name{scaler}
+\alias{scaler}
+\title{Scaling a dataframe}
+\usage{
+scaler(X_train, X_Valid, X_test, scale_features, scaler_type)
+}
+\arguments{
+\item{X_train}{data.frame}
+
+\item{X_Valid}{data.frame}
+
+\item{X_test}{data.frame}
+
+\item{scale_features}{character vector}
+
+\item{scaler_type}{character}
+}
+\value{
+data.frame of data.frames
+}
+\description{
+This function scales numerical features based on scaling requirement in a data.frame
+}
+\examples{
+X_train<- data.frame('name' = c('pandaman', 'doorman', 'eve'), 'age' = c(15,20,25), 'networth' = c(100000,100,100000))
+X_Valid<- data.frame('name' = c('pandaman_v', 'doorman_v', 'eve_v'), 'age' = c(15,56, 43) , 'networth' = c(123124, 352334,645645))
+X_test <- data.frame('name' = c('pandaman_t', 'doorman_t', 'eve_t'), 'age' =c(14,15,56), 'networth' = c(123124,90914, 124124))
+scaled_df <- scaler(X_train, X_Valid, X_test, c('age','networth'), scaler_type='standardscaler')
+}
diff --git a/tests/testthat.R b/tests/testthat.R
new file mode 100644
index 0000000..82afbe4
--- /dev/null
+++ b/tests/testthat.R
@@ -0,0 +1,4 @@
+library(testthat)
+library(prepropyr)
+
+test_check("prepropyr")
diff --git a/tests/testthat/test-imputation.R b/tests/testthat/test-imputation.R
new file mode 100644
index 0000000..df8517b
--- /dev/null
+++ b/tests/testthat/test-imputation.R
@@ -0,0 +1,36 @@
+df_1 <- data.frame('a' = c(NA, 4, 10), 'b' = c(2, NA, 5), 'c' = c(3,6,9))
+df_1_t <- data.frame('a' = c(7, 4, 10), 'b' = c(2, 3.5, 5), 'c' = c(3,6,9))
+df_2 <- data.frame('a' = c(NA, 4, 10, 3), 'b' = c(2, NA, 5, 15), 'c' = c(3, 6, 9, 17))
+df_2_t <- data.frame('a' = c(4.0, 4, 10, 3), 'b' = c(2, 5.0, 5, 15), 'c' = c(3, 6, 9, 17))
+df_3 <- data.frame('a' = c(NA, 'd', 'd'), 'b' = c('b', NA, 'x'), 'c' = c('c','f',NA))
+df_3_t <- data.frame('a' = c('test', 'd', 'd'), 'b' = c('b','test', 'x'), 'c' = c('c','f','test'))
+df_m <- data.frame('a' = c(NA, 4, 10), 'b' = c('a', NA, 'b'), 'c' = c(3,6,9))
+df_m_t <- data.frame('a' = c('test', 4, 10), 'b' = c('a', 'test', 'b'), 'c' = c(3,6,9))
+
+test_that("imputation returns output as expected given different methods", {
+  # mean: each NA is replaced by its column mean (a -> 7, b -> 3.5)
+  expect_equal(imputation(df_1, df_1, 'mean'), df_1_t)
+  # median: each NA is replaced by its column median (a -> 4, b -> 5)
+  expect_equal(imputation(df_2, df_2, 'median'), df_2_t)
+  # constant: every NA is replaced by the supplied value 'test'
+  expect_equal(imputation(df_3, df_3, 'constant', 'test'), df_3_t)
+  # mean imputation on a data frame without NA values returns it unchanged
+  expect_equal(imputation(df_1_t, df_1_t, 'mean'), df_1_t)
+  # constant imputation on a data frame mixing numeric and character columns
+  expect_equal(imputation(df_m, df_m, 'constant', 'test'), df_m_t)
+  # mean imputation on a one-row, one-column data frame with no NA
+  expect_equal(imputation(data.frame('a' = c(1)), data.frame('a' = c(1)), 'mean'), data.frame('a' = c(1)))
+})
+
+test_that('tests whether exceptions will raise as expected', {
+  # fit_data that is not a data.frame must raise an error
+  expect_error(imputation(1,df_1, 'mean'))
+  # fill_data that is not a data.frame must raise an error
+  expect_error(imputation(df_1,1, 'mean'))
+  # a method name outside mean/median/constant must raise an error
+  expect_error(imputation(df_1, df_1, 'test'))
+  # fit_data (3 columns) and fill_data (1 column) differ in column count
+  expect_error(imputation(df_1, data.frame('a' = c(1)), 'mean'))
+  # mean imputation on the character-only df_3 must raise an error
+  expect_error(imputation(df_3, df_3, 'mean'))
+})