From 272cb8bfe9522da8caa5639776e8911b77eeb90c Mon Sep 17 00:00:00 2001
From: Karim-Mane <karimanee@outlook.com>
Date: Thu, 1 Aug 2024 00:38:19 +0000
Subject: [PATCH] update the scan_date() function - only scan through character
 columns

---
 R/clean_data_helpers.R                        | 283 +++++++-----------
 .../printing-rmd/skeleton/skeleton.Rmd        |   2 +-
 man/scan_columns.Rd                           |  25 --
 man/scan_data.Rd                              |  41 ++-
 man/scan_in_character.Rd                      |   2 +-
 man/scan_in_double.Rd                         |  19 --
 man/scan_in_integer.Rd                        |  19 --
 man/scan_in_logical.Rd                        |  19 --
 tests/testthat/test-clean_data_helpers.R      |  32 +-
 tests/testthat/test-utils.R                   |   4 +-
 10 files changed, 151 insertions(+), 295 deletions(-)
 delete mode 100644 man/scan_columns.Rd
 delete mode 100644 man/scan_in_double.Rd
 delete mode 100644 man/scan_in_integer.Rd
 delete mode 100644 man/scan_in_logical.Rd

diff --git a/R/clean_data_helpers.R b/R/clean_data_helpers.R
index b86a4225..8e0bd5bd 100644
--- a/R/clean_data_helpers.R
+++ b/R/clean_data_helpers.R
@@ -1,159 +1,64 @@
-
-#' Calculate the percentage of missing and other data type values in a vector
-#' containing different data types such as numeric, Date, character,
-#' logical, date-time, factor.
-#'
-#' @param x A vector of ones or a combination of various data types.
-#' @param type A character with the the vector type.
-#'
-#' @returns A vector of 5 elements representing the percentage of missing,
-#' numeric, date, character, and logical values found in the input vector.
-#'
-#' @keywords internal
-#'
-scan_columns <- function(x, type) {
-  res <- switch(type,
-                double = scan_in_double(x),
-                integer = scan_in_integer(x),
-                logical = scan_in_logical(x),
-                character = scan_in_character(x))
-  return(res)
-}
-
-#' Scan a data frame to determine the percentage of `missing`, `numeric`,
-#'    `Date`, `character`, `logical`, `date-time`, and `factor` values in every
-#'    column.
+#' Scan through all character columns of a data frame to determine the
+#' proportion of `missing`, `numeric`, `Date`, `character`, `logical`, values.
 #'
 #' @param data A data frame or linelist
 #'
-#' @returns A data frame or linelist with the same number of rows as the number
-#'    of columns of the input data, and 8 column representing the field names,
-#'    the percentage of missing, numeric, date, character, logical, date-time,
-#'    and factor values in each column.
+#' @returns A data frame with the same number of rows as the number of character
+#'    columns of the input data, and six (06) columns representing the field
+#'    names, the percentage of missing, numeric, date, character, and logical,
+#'    values in each column.
 #'
 #' @export
 #'
 #' @examples
+#' # scan through a data frame of characters
 #' scan_result <- scan_data(
 #'   data = readRDS(system.file("extdata", "messy_data.RDS",
 #'                              package = "cleanepi"))
 #' )
 #'
-#' @details
-#' For columns of type character, the detected numeric values could actually be
-#' of type Date or date-time. This is because R coerces some Date values into
-#' numeric when the date is imported from an MS Excel file.
+#' # scan through a data frame with two character columns
+#' scan_result <- scan_data(
+#'   data = readRDS(system.file("extdata", "test_linelist.RDS",
+#'                              package = "cleanepi"))
+#'   )
+#'
+#' # scan through a data frame with no character columns
+#' data(iris)
+#' iris[["fct"]] <- as.factor(sample(c("gray", "orange"), nrow(iris),
+#'                            replace = TRUE))
+#' iris[["lgl"]]  <- sample(c(TRUE, FALSE), nrow(iris), replace = TRUE)
+#' iris[["date"]] <- as.Date(seq.Date(from = as.Date("2024-01-01"),
+#'                                    to = as.Date("2024-08-30"),
+#'                                    length.out = nrow(iris)))
+#' iris[["posit_ct"]] <- as.POSIXct(iris[["date"]])
+#' scan_result        <- scan_data(data = iris)
 #'
 scan_data <- function(data) {
-  types       <- vapply(data, typeof, character(1L))
+  # scan through all columns of the data and the identify character columns
+  types <- vapply(data, typeof, character(1L))
+  target_columns <- types[types == "character"]
+
+  # send an message if there is no character column found within the input data
+  if (length(target_columns) == 0L) {
+    message("No character column found in the provided data.")
+    return(invisible(NA))
+  }
+
+  # scan through the character columns
+  data <- data[, names(target_columns)]
   scan_result <- vapply(seq_len(ncol(data)), function(col_index) {
-    scan_columns(data[[col_index]], types[[col_index]])
-  }, numeric(7L))
+    scan_in_character(data[[col_index]])
+  }, numeric(5L))
   scan_result        <- as.data.frame(t(scan_result))
-  names(scan_result) <- c("missing", "numeric", "date", "character",
-                          "logical", "date-time", "factor")
+  names(scan_result) <- c("missing", "numeric", "date", "character", "logical")
   scan_result        <- cbind(Field_names = names(data), scan_result)
   return(scan_result)
 }
 
-#' Scan through a double column
-#'
-#' @param x The input vector
-#'
-#' @return A numeric vector with the proportion of the different types of data
-#'    that were detected within the input vector.
-#' @keywords internal
-#'
-scan_in_double <- function(x) {
-  are_factor <- are_date <- are_date_time <- are_character <- are_numeric <-
-    are_logical <- are_na <- 0.0
-  # save the variable length
-  n_rows <- length(x)
-
-  # get the proportion of NA
-  are_na <- round((sum(is.na(x)) / n_rows), 6L)
-  x      <- x[!is.na(x)]
-
-  # doubles are either numeric (attributes = NULL), or Date (has a 'class'
-  # attributes = Date), or date-time (has a 'class' attributes = POSIXt)
-  if ("class" %in% names(attributes(x))) {
-    if ("Date" %in% attributes(x)[["class"]]) {
-      are_date <- round((length(x) / n_rows), 6L)
-    } else if ("POSIXt" %in% attributes(x)[["class"]]) {
-      are_date_time <- round((length(x) / n_rows), 6L)
-    }
-  } else {
-    are_numeric <- round((length(x) / n_rows), 6L)
-  }
-  return(
-    c(are_na, are_numeric, are_date, are_character, are_logical, are_date_time,
-      are_factor)
-  )
-}
-
-#' Scan through an integer column
-#'
-#' @param x The input vector
-#'
-#' @return A numeric vector with the proportion of the different types of data
-#'    that were detected within the input vector.
-#' @keywords internal
-#'
-scan_in_integer <- function(x) {
-  are_factor <- are_date <- are_date_time <- are_character <- are_numeric <-
-    are_logical <- are_na <- 0.0
-  # save the variable length
-  n_rows <- length(x)
-
-  # get the proportion of NA
-  are_na <- round((sum(is.na(x)) / n_rows), 6L)
-  x      <- x[!is.na(x)]
-
-  # integers are either numeric (attributes = NULL), or factors (has a 'class'
-  # and 'levels' attributes)
-  if (is.null(attributes(x))) {
-    are_numeric <- round((length(x) / n_rows), 6L)
-  } else if (identical(names(attributes(x)), c("levels", "class"))) {
-    are_factor <- round((length(x) / n_rows), 6L)
-  }
-
-  return(
-    c(are_na, are_numeric, are_date, are_character, are_logical, are_date_time,
-      are_factor)
-  )
-}
-
-#' Scan through a logical column
-#'
-#' @param x The input vector
-#'
-#' @return A numeric vector with the proportion of the different types of data
-#'    that were detected within the input vector.
-#' @keywords internal
-#'
-scan_in_logical <- function(x) {
-  are_factor <- are_date <- are_date_time <- are_character <- are_numeric <-
-    are_logical <- are_na <- 0.0
-
-  # logical are simply logical. We will only determine the %NA and %logical
-  # save the variable length
-  n_rows <- length(x)
-
-  # get the proportion of NA
-  are_na <- round((sum(is.na(x)) / n_rows), 6L)
-  x      <- x[!is.na(x)]
-
-  # get the proportion of logical
-  are_logical <- round((length(x) / n_rows), 6L)
-  return(
-    c(are_na, are_numeric, are_date, are_character, are_logical, are_date_time,
-      are_factor)
-  )
-}
-
 #' Scan through a character column
 #'
-#' @param x The input vector
+#' @param x The input character vector
 #'
 #' @return A numeric vector with the proportion of the different types of data
 #'    that were detected within the input vector.
@@ -161,62 +66,82 @@ scan_in_logical <- function(x) {
 #'
 scan_in_character <- function(x) {
   # There might be, within a character column, values of type:
-  # character, numeric, date, date-time, NA, and logical
+  # character, numeric, date (date or date-time), NA, and logical
   # In this function, we check the presence of these different types within a
   # character column.
-  # Note that numeric values can actually be of 'Date' or 'date-time' type.
-  # Given that any numeric can be converted into Date, we will not check for
-  # Date or date-time values within the numeric.
-
-  are_factor <- are_date <- are_date_time <- are_character <- are_numeric <-
-    are_logical <- are_na <- 0.0
 
   # save the variable length
-  n_rows <- length(x)
+  initial_length <- length(x)
 
-  # get the proportion of NA
-  are_na <- round((sum(is.na(x)) / n_rows), 6L)
+  # get the count of missing data (NA)
+  na_count <- sum(is.na(x))
   x      <- x[!is.na(x)]
 
-  # get double values and evaluate the proportion numeric values
-  doubles <- x[!is.na(suppressWarnings(as.double(x)))]
-  if (length(doubles) > 0L) {
-    are_numeric <- round((length(doubles) / n_rows), 6L)
-  }
-
-  # get character values and check for the presence of Date and date-time
-  characters <- x[is.na(suppressWarnings(as.double(x)))]
-  if (length(characters) > 0L &&
-      !is.null(lubridate::guess_formats(characters,
-                                        c("ymd", "ydm", "dmy", "mdy", "myd",
-                                          "dym", "Ymd", "Ydm", "dmY", "mdY",
-                                          "mYd", "dYm")))) {
-    # get the proportion of date values
-    tmp    <- suppressWarnings(
-      as.Date(
-        lubridate::parse_date_time(
-          characters,
-          orders = c("ymd", "ydm", "dmy", "mdy", "myd", "dym", "Ymd", "Ydm",
-                     "dmY", "mdY", "mYd", "dYm")
-        )
+  # We will check if there is any Date values within the variable by parsing the
+  # values, looking for the ones that fit any of the predefined format.
+  #     When there is one or more Date values, we will convert the variable into
+  # numeric and determine if any of them is a Date (a numeric, which after
+  # conversion to Date, fall within the interval
+  # [50 years back from today's date, today's date]). That way the Date count is
+  # the count of date identified from the parsing + the count of Dates within
+  # the numeric values.
+  #     When there is no Date values identified from the parsing, the variable
+  # is converted into numeric. The numeric count is the sum of numeric values.
+  #     The logical count is the number of TRUE and FALSE written in both lower
+  # and upper case within the variable
+  #     The remaining values will be of type character.
+
+  # parsing the vector, looking for date values
+  tmp <- suppressWarnings(
+    as.Date(
+      lubridate::parse_date_time(
+        x,
+        orders = c("ymd", "ydm", "dmy", "mdy", "myd", "dym", "Ymd", "Ydm",
+                   "dmY", "mdY", "mYd", "dYm")
       )
     )
-    are_date   <- round((sum(!is.na(tmp)) / n_rows), 6L)
-    x          <- x[is.na(tmp)]
-    characters <- characters[is.na(tmp)]
+  )
+
+  # getting the date and numeric count as describe above
+  date_count <- numeric_count <- 0L
+  if (sum(!is.na(tmp)) > 0L) {
+    # Setting the first date to 50 years before the current date
+    target_interval <- sort(
+      seq.Date(Sys.Date(), length.out = 2L, by = "-50 years")
+    )
+
+    # get the date count
+    date_count <- date_count + sum(!is.na(tmp))
+
+    # convert to numeric and check for the presence of Date among the numeric
+    tmp2 <- x[is.na(tmp)]
+    tmp3 <- suppressWarnings(as.numeric(tmp2))
+    if (sum(!is.na(tmp3)) > 0L) {
+      y <- lubridate::as_date(
+        tmp3[!is.na(tmp3)],
+        origin = target_interval[[1L]]
+      )
+      # second count of date values coming from date within numeric
+      date_count <- date_count + sum(!is.na(y))
+      numeric_count <- sum(is.na(y))
+    }
+  } else {
+    tmp <- suppressWarnings(as.numeric(x))
+    numeric_count <- sum(!is.na(tmp))
   }
 
-  # get the proportion of logical values
-  logicals <- toupper(characters) == "TRUE" | toupper(characters) == "FALSE"
-  are_logical <- round((sum(logicals) / n_rows), 6L)
+  # get logical count
+  logicals <- toupper(x) == "TRUE" | toupper(x) == "FALSE"
+  logical_count <- sum(logicals)
 
-  # get the proportion of character values
-  are_character <- round((1.0 - (are_na + are_numeric +
-                                   are_date + are_logical)), 6L)
+  # get the character count
+  character_count <- initial_length -
+    (na_count + logical_count + numeric_count + date_count)
 
-  # return the output
-  return(
-    c(are_na, are_numeric, are_date, are_character, are_logical, are_date_time,
-      are_factor)
-  )
+  # transform into proportions
+  props <- round(
+    c(na_count, numeric_count, date_count, character_count, logical_count) /
+      initial_length, 4L)
+
+  return(props)
 }
diff --git a/inst/rmarkdown/templates/printing-rmd/skeleton/skeleton.Rmd b/inst/rmarkdown/templates/printing-rmd/skeleton/skeleton.Rmd
index 837d556c..e13e407b 100644
--- a/inst/rmarkdown/templates/printing-rmd/skeleton/skeleton.Rmd
+++ b/inst/rmarkdown/templates/printing-rmd/skeleton/skeleton.Rmd
@@ -59,7 +59,7 @@ p.compact {
 ```{r cleanepi-source-data, eval=TRUE, echo=FALSE}
 # EXTRACT THE REPORT SECTIONS
 scanning_result <- params[["scanning_result"]]
-is_data_scanned <- !is.null(scanning_result)
+is_data_scanned <- !is.na(scanning_result)
 standardized_column_names <- params[["colnames"]]
 are_column_standardised   <- !is.null(standardized_column_names)
 out_of_range_dates           <- params[["out_of_range_dates"]]
diff --git a/man/scan_columns.Rd b/man/scan_columns.Rd
deleted file mode 100644
index b359875d..00000000
--- a/man/scan_columns.Rd
+++ /dev/null
@@ -1,25 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/clean_data_helpers.R
-\name{scan_columns}
-\alias{scan_columns}
-\title{Calculate the percentage of missing and other data type values in a vector
-containing different data types such as numeric, Date, character,
-logical, date-time, factor.}
-\usage{
-scan_columns(x, type)
-}
-\arguments{
-\item{x}{A vector of ones or a combination of various data types.}
-
-\item{type}{A character with the the vector type.}
-}
-\value{
-A vector of 5 elements representing the percentage of missing,
-numeric, date, character, and logical values found in the input vector.
-}
-\description{
-Calculate the percentage of missing and other data type values in a vector
-containing different data types such as numeric, Date, character,
-logical, date-time, factor.
-}
-\keyword{internal}
diff --git a/man/scan_data.Rd b/man/scan_data.Rd
index cd6b62dd..90a2ee21 100644
--- a/man/scan_data.Rd
+++ b/man/scan_data.Rd
@@ -2,9 +2,8 @@
 % Please edit documentation in R/clean_data_helpers.R
 \name{scan_data}
 \alias{scan_data}
-\title{Scan a data frame to determine the percentage of \code{missing}, \code{numeric},
-\code{Date}, \code{character}, \code{logical}, \code{date-time}, and \code{factor} values in every
-column.}
+\title{Scan through all character columns of a data frame to determine the
+proportion of \code{missing}, \code{numeric}, \code{Date}, \code{character}, \code{logical}, values.}
 \usage{
 scan_data(data)
 }
@@ -12,25 +11,37 @@ scan_data(data)
 \item{data}{A data frame or linelist}
 }
 \value{
-A data frame or linelist with the same number of rows as the number
-of columns of the input data, and 8 column representing the field names,
-the percentage of missing, numeric, date, character, logical, date-time,
-and factor values in each column.
+A data frame with the same number of rows as the number of character
+columns of the input data, and six (06) columns representing the field
+names, the percentage of missing, numeric, date, character, and logical,
+values in each column.
 }
 \description{
-Scan a data frame to determine the percentage of \code{missing}, \code{numeric},
-\code{Date}, \code{character}, \code{logical}, \code{date-time}, and \code{factor} values in every
-column.
-}
-\details{
-For columns of type character, the detected numeric values could actually be
-of type Date or date-time. This is because R coerces some Date values into
-numeric when the date is imported from an MS Excel file.
+Scan through all character columns of a data frame to determine the
+proportion of \code{missing}, \code{numeric}, \code{Date}, \code{character}, \code{logical}, values.
 }
 \examples{
+# scan through a data frame of characters
 scan_result <- scan_data(
   data = readRDS(system.file("extdata", "messy_data.RDS",
                              package = "cleanepi"))
 )
 
+# scan through a data frame with two character columns
+scan_result <- scan_data(
+  data = readRDS(system.file("extdata", "test_linelist.RDS",
+                             package = "cleanepi"))
+  )
+
+# scan through a data frame with no character columns
+data(iris)
+iris[["fct"]] <- as.factor(sample(c("gray", "orange"), nrow(iris),
+                           replace = TRUE))
+iris[["lgl"]]  <- sample(c(TRUE, FALSE), nrow(iris), replace = TRUE)
+iris[["date"]] <- as.Date(seq.Date(from = as.Date("2024-01-01"),
+                                   to = as.Date("2024-08-30"),
+                                   length.out = nrow(iris)))
+iris[["posit_ct"]] <- as.POSIXct(iris[["date"]])
+scan_result        <- scan_data(data = iris)
+
 }
diff --git a/man/scan_in_character.Rd b/man/scan_in_character.Rd
index e1e912fc..83559951 100644
--- a/man/scan_in_character.Rd
+++ b/man/scan_in_character.Rd
@@ -7,7 +7,7 @@
 scan_in_character(x)
 }
 \arguments{
-\item{x}{The input vector}
+\item{x}{The input character vector}
 }
 \value{
 A numeric vector with the proportion of the different types of data
diff --git a/man/scan_in_double.Rd b/man/scan_in_double.Rd
deleted file mode 100644
index 3c277218..00000000
--- a/man/scan_in_double.Rd
+++ /dev/null
@@ -1,19 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/clean_data_helpers.R
-\name{scan_in_double}
-\alias{scan_in_double}
-\title{Scan through a double column}
-\usage{
-scan_in_double(x)
-}
-\arguments{
-\item{x}{The input vector}
-}
-\value{
-A numeric vector with the proportion of the different types of data
-that were detected within the input vector.
-}
-\description{
-Scan through a double column
-}
-\keyword{internal}
diff --git a/man/scan_in_integer.Rd b/man/scan_in_integer.Rd
deleted file mode 100644
index d807eed6..00000000
--- a/man/scan_in_integer.Rd
+++ /dev/null
@@ -1,19 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/clean_data_helpers.R
-\name{scan_in_integer}
-\alias{scan_in_integer}
-\title{Scan through an integer column}
-\usage{
-scan_in_integer(x)
-}
-\arguments{
-\item{x}{The input vector}
-}
-\value{
-A numeric vector with the proportion of the different types of data
-that were detected within the input vector.
-}
-\description{
-Scan through an integer column
-}
-\keyword{internal}
diff --git a/man/scan_in_logical.Rd b/man/scan_in_logical.Rd
deleted file mode 100644
index 61f22803..00000000
--- a/man/scan_in_logical.Rd
+++ /dev/null
@@ -1,19 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/clean_data_helpers.R
-\name{scan_in_logical}
-\alias{scan_in_logical}
-\title{Scan through a logical column}
-\usage{
-scan_in_logical(x)
-}
-\arguments{
-\item{x}{The input vector}
-}
-\value{
-A numeric vector with the proportion of the different types of data
-that were detected within the input vector.
-}
-\description{
-Scan through a logical column
-}
-\keyword{internal}
diff --git a/tests/testthat/test-clean_data_helpers.R b/tests/testthat/test-clean_data_helpers.R
index d504f0f7..880b2c9d 100644
--- a/tests/testthat/test-clean_data_helpers.R
+++ b/tests/testthat/test-clean_data_helpers.R
@@ -1,15 +1,15 @@
 test_that("scan_data works as expected", {
-  dat        <- readRDS(system.file("extdata", "messy_data.RDS",
-                                    package = "cleanepi"))
+  # using a dataset with character columns only
+  dat <- readRDS(system.file("extdata", "messy_data.RDS", package = "cleanepi"))
   scan_result <- scan_data(data = dat)
   expect_s3_class(scan_result, "data.frame")
   expect_named(scan_result, c("Field_names", "missing", "numeric", "date",
-                              "character", "logical", "date-time", "factor"))
-  expect_identical(ncol(scan_result), 8L)
+                              "character", "logical"))
+  expect_identical(ncol(scan_result), 6L)
   expect_identical(nrow(scan_result), ncol(dat))
   expect_identical(scan_result[["Field_names"]], names(dat))
 
-  # using a dataset with many data types
+  # using a dataset with no character column
   data(iris)
   iris[["fct"]]  <- as.factor(sample(c("gray", "orange"), nrow(iris),
                                     replace = TRUE))
@@ -19,14 +19,16 @@ test_that("scan_data works as expected", {
                                      length.out = nrow(iris)))
   iris[["posit_ct"]] <- as.POSIXct(iris[["date"]])
   scan_result        <- scan_data(data = iris)
-  expect_identical(ncol(scan_result), 8L)
-  expect_identical(nrow(scan_result), ncol(iris))
-  expect_identical(scan_result[["Field_names"]], names(iris))
-  expect_identical(sum(scan_result[["numeric"]]), 4)
-  expect_identical(sum(scan_result[["missing"]]), 0)
-  expect_identical(sum(scan_result[["date"]]), 1)
-  expect_identical(sum(scan_result[["character"]]), 0)
-  expect_identical(sum(scan_result[["logical"]]), 1)
-  expect_identical(sum(scan_result[["date-time"]]), 1)
-  expect_identical(sum(scan_result[["factor"]]), 2)
+  expect_identical(scan_result, NA)
+  expect_message(scan_data(data = iris),
+                 "No character column found in the provided data.")
+
+  # using a data with some character columns
+  dat <- readRDS(system.file("extdata", "test_linelist.RDS",
+                             package = "cleanepi"))
+  scan_result <- suppressWarnings(scan_data(data = dat))
+  expect_identical(ncol(scan_result), 6L)
+  expect_identical(nrow(scan_result), 2L)
+  expect_false(nrow(scan_result) == ncol(dat))
+  expect_identical(scan_result[["Field_names"]], c("id", "age_class"))
 })
diff --git a/tests/testthat/test-utils.R b/tests/testthat/test-utils.R
index 1f188491..dafb13f5 100644
--- a/tests/testthat/test-utils.R
+++ b/tests/testthat/test-utils.R
@@ -112,8 +112,8 @@ test_that("add_to_report works as expected", {
   expect_named(report, "scanning_result")
   expect_named(report[["scanning_result"]],
                c("Field_names", "missing", "numeric", "date", "character",
-                 "logical", "date-time", "factor"))
-  expect_identical(nrow(report[["scanning_result"]]), ncol(data))
+                 "logical"))
+  expect_identical(nrow(report[["scanning_result"]]), 6L)
 })