update the scan_date() function - only scan through character columns

epiverse-trace · Aug 1, 2024 · 272cb8b · 272cb8b
1 parent 0d11788
commit 272cb8b
Show file tree

Hide file tree

Showing 10 changed files with 151 additions and 295 deletions.
diff --git a/R/clean_data_helpers.R b/R/clean_data_helpers.R
@@ -1,222 +1,147 @@
-
-#' Calculate the percentage of missing and other data type values in a vector
-#' containing different data types such as numeric, Date, character,
-#' logical, date-time, factor.
-#'
-#' @param x A vector of ones or a combination of various data types.
-#' @param type A character with the the vector type.
-#'
-#' @returns A vector of 5 elements representing the percentage of missing,
-#' numeric, date, character, and logical values found in the input vector.
-#'
-#' @keywords internal
-#'
-scan_columns <- function(x, type) {
- res <- switch(type,
- double = scan_in_double(x),
- integer = scan_in_integer(x),
- logical = scan_in_logical(x),
- character = scan_in_character(x))
- return(res)
-}
-
-#' Scan a data frame to determine the percentage of `missing`, `numeric`,
-#' `Date`, `character`, `logical`, `date-time`, and `factor` values in every
-#' column.
+#' Scan through all character columns of a data frame to determine the
+#' proportion of `missing`, `numeric`, `Date`, `character`, `logical`, values.
 #'
 #' @param data A data frame or linelist
 #'
-#' @returns A data frame or linelist with the same number of rows as the number
-#' of columns of the input data, and 8 column representing the field names,
-#' the percentage of missing, numeric, date, character, logical, date-time,
-#' and factor values in each column.
+#' @returns A data frame with the same number of rows as the number of character
+#' columns of the input data, and six (06) columns representing the field
+#' names, the percentage of missing, numeric, date, character, and logical,
+#' values in each column.
 #'
 #' @export
 #'
 #' @examples
+#' # scan through a data frame of characters
 #' scan_result <- scan_data(
 #' data = readRDS(system.file("extdata", "messy_data.RDS",
 #' package = "cleanepi"))
 #' )
 #'
-#' @details
-#' For columns of type character, the detected numeric values could actually be
-#' of type Date or date-time. This is because R coerces some Date values into
-#' numeric when the date is imported from an MS Excel file.
+#' # scan through a data frame with two character columns
+#' scan_result <- scan_data(
+#' data = readRDS(system.file("extdata", "test_linelist.RDS",
+#' package = "cleanepi"))
+#' )
+#'
+#' # scan through a data frame with no character columns
+#' data(iris)
+#' iris[["fct"]] <- as.factor(sample(c("gray", "orange"), nrow(iris),
+#' replace = TRUE))
+#' iris[["lgl"]] <- sample(c(TRUE, FALSE), nrow(iris), replace = TRUE)
+#' iris[["date"]] <- as.Date(seq.Date(from = as.Date("2024-01-01"),
+#' to = as.Date("2024-08-30"),
+#' length.out = nrow(iris)))
+#' iris[["posit_ct"]] <- as.POSIXct(iris[["date"]])
+#' scan_result <- scan_data(data = iris)
 #'
 scan_data <- function(data) {
- types <- vapply(data, typeof, character(1L))
+ # scan through all columns of the data and the identify character columns
+ types <- vapply(data, typeof, character(1L))
+ target_columns <- types[types == "character"]
+
+ # send an message if there is no character column found within the input data
+ if (length(target_columns) == 0L) {
+ message("No character column found in the provided data.")
+ return(invisible(NA))
+ }
+
+ # scan through the character columns
+ data <- data[, names(target_columns)]
  scan_result <- vapply(seq_len(ncol(data)), function(col_index) {
- scan_columns(data[[col_index]], types[[col_index]])
- }, numeric(7L))
+ scan_in_character(data[[col_index]])
+ }, numeric(5L))
  scan_result <- as.data.frame(t(scan_result))
- names(scan_result) <- c("missing", "numeric", "date", "character",
- "logical", "date-time", "factor")
+ names(scan_result) <- c("missing", "numeric", "date", "character", "logical")
  scan_result <- cbind(Field_names = names(data), scan_result)
  return(scan_result)
 }
 
-#' Scan through a double column
-#'
-#' @param x The input vector
-#'
-#' @return A numeric vector with the proportion of the different types of data
-#' that were detected within the input vector.
-#' @keywords internal
-#'
-scan_in_double <- function(x) {
- are_factor <- are_date <- are_date_time <- are_character <- are_numeric <-
- are_logical <- are_na <- 0.0
- # save the variable length
- n_rows <- length(x)
-
- # get the proportion of NA
- are_na <- round((sum(is.na(x)) / n_rows), 6L)
- x <- x[!is.na(x)]
-
- # doubles are either numeric (attributes = NULL), or Date (has a 'class'
- # attributes = Date), or date-time (has a 'class' attributes = POSIXt)
- if ("class" %in% names(attributes(x))) {
- if ("Date" %in% attributes(x)[["class"]]) {
- are_date <- round((length(x) / n_rows), 6L)
- } else if ("POSIXt" %in% attributes(x)[["class"]]) {
- are_date_time <- round((length(x) / n_rows), 6L)
- }
- } else {
- are_numeric <- round((length(x) / n_rows), 6L)
- }
- return(
- c(are_na, are_numeric, are_date, are_character, are_logical, are_date_time,
- are_factor)
- )
-}
-
-#' Scan through an integer column
-#'
-#' @param x The input vector
-#'
-#' @return A numeric vector with the proportion of the different types of data
-#' that were detected within the input vector.
-#' @keywords internal
-#'
-scan_in_integer <- function(x) {
- are_factor <- are_date <- are_date_time <- are_character <- are_numeric <-
- are_logical <- are_na <- 0.0
- # save the variable length
- n_rows <- length(x)
-
- # get the proportion of NA
- are_na <- round((sum(is.na(x)) / n_rows), 6L)
- x <- x[!is.na(x)]
-
- # integers are either numeric (attributes = NULL), or factors (has a 'class'
- # and 'levels' attributes)
- if (is.null(attributes(x))) {
- are_numeric <- round((length(x) / n_rows), 6L)
- } else if (identical(names(attributes(x)), c("levels", "class"))) {
- are_factor <- round((length(x) / n_rows), 6L)
- }
-
- return(
- c(are_na, are_numeric, are_date, are_character, are_logical, are_date_time,
- are_factor)
- )
-}
-
-#' Scan through a logical column
-#'
-#' @param x The input vector
-#'
-#' @return A numeric vector with the proportion of the different types of data
-#' that were detected within the input vector.
-#' @keywords internal
-#'
-scan_in_logical <- function(x) {
- are_factor <- are_date <- are_date_time <- are_character <- are_numeric <-
- are_logical <- are_na <- 0.0
-
- # logical are simply logical. We will only determine the %NA and %logical
- # save the variable length
- n_rows <- length(x)
-
- # get the proportion of NA
- are_na <- round((sum(is.na(x)) / n_rows), 6L)
- x <- x[!is.na(x)]
-
- # get the proportion of logical
- are_logical <- round((length(x) / n_rows), 6L)
- return(
- c(are_na, are_numeric, are_date, are_character, are_logical, are_date_time,
- are_factor)
- )
-}
-
 #' Scan through a character column
 #'
-#' @param x The input vector
+#' @param x The input character vector
 #'
 #' @return A numeric vector with the proportion of the different types of data
 #' that were detected within the input vector.
 #' @keywords internal
 #'
 scan_in_character <- function(x) {
  # There might be, within a character column, values of type:
- # character, numeric, date, date-time, NA, and logical
+ # character, numeric, date (date or date-time), NA, and logical
  # In this function, we check the presence of these different types within a
  # character column.
- # Note that numeric values can actually be of 'Date' or 'date-time' type.
- # Given that any numeric can be converted into Date, we will not check for
- # Date or date-time values within the numeric.
-
- are_factor <- are_date <- are_date_time <- are_character <- are_numeric <-
- are_logical <- are_na <- 0.0
 
  # save the variable length
- n_rows <- length(x)
+ initial_length <- length(x)
 
- # get the proportion of NA
- are_na <- round((sum(is.na(x)) / n_rows), 6L)
+ # get the count of missing data (NA)
+ na_count <- sum(is.na(x))
  x <- x[!is.na(x)]
 
- # get double values and evaluate the proportion numeric values
- doubles <- x[!is.na(suppressWarnings(as.double(x)))]
- if (length(doubles) > 0L) {
-  are_numeric <- round((length(doubles) / n_rows), 6L)
- }
-
- # get character values and check for the presence of Date and date-time
- characters <- x[is.na(suppressWarnings(as.double(x)))]
- if (length(characters) > 0L &&
-  !is.null(lubridate::guess_formats(characters,
-  c("ymd", "ydm", "dmy", "mdy", "myd",
-  "dym", "Ymd", "Ydm", "dmY", "mdY",
-  "mYd", "dYm")))) {
- # get the proportion of date values
-  tmp <- suppressWarnings(
-  as.Date(
-  lubridate::parse_date_time(
-  characters,
-  orders = c("ymd", "ydm", "dmy", "mdy", "myd", "dym", "Ymd", "Ydm",
-  "dmY", "mdY", "mYd", "dYm")
- )
+ # We will check if there is any Date values within the variable by parsing the
+ # values, looking for the ones that fit any of the predefined format.
+ # When there is one or more Date values, we will convert the variable into
+ # numeric and determine if any of them is a Date (a numeric, which after
+ # conversion to Date, fall within the interval
+ # [50 years back from today's date, today's date]). That way the Date count is
+ # the count of date identified from the parsing + the count of Dates within
+ # the numeric values.
+ # When there is no Date values identified from the parsing, the variable
+ # is converted into numeric. The numeric count is the sum of numeric values.
+ # The logical count is the number of TRUE and FALSE written in both lower
+ # and upper case within the variable
+ # The remaining values will be of type character.
+
+ # parsing the vector, looking for date values
+ tmp <- suppressWarnings(
+ as.Date(
+ lubridate::parse_date_time(
+ x,
+ orders = c("ymd", "ydm", "dmy", "mdy", "myd", "dym", "Ymd", "Ydm",
+  "dmY", "mdY", "mYd", "dYm")
  )
  )
- are_date <- round((sum(!is.na(tmp)) / n_rows), 6L)
- x <- x[is.na(tmp)]
- characters <- characters[is.na(tmp)]
+ )
+
+ # getting the date and numeric count as describe above
+ date_count <- numeric_count <- 0L
+ if (sum(!is.na(tmp)) > 0L) {
+ # Setting the first date to 50 years before the current date
+ target_interval <- sort(
+ seq.Date(Sys.Date(), length.out = 2L, by = "-50 years")
+ )
+
+ # get the date count
+ date_count <- date_count + sum(!is.na(tmp))
+
+ # convert to numeric and check for the presence of Date among the numeric
+ tmp2 <- x[is.na(tmp)]
+ tmp3 <- suppressWarnings(as.numeric(tmp2))
+ if (sum(!is.na(tmp3)) > 0L) {
+ y <- lubridate::as_date(
+ tmp3[!is.na(tmp3)],
+ origin = target_interval[[1L]]
+ )
+ # second count of date values coming from date within numeric
+ date_count <- date_count + sum(!is.na(y))
+ numeric_count <- sum(is.na(y))
+ }
+ } else {
+ tmp <- suppressWarnings(as.numeric(x))
+ numeric_count <- sum(!is.na(tmp))
  }
 
- # get the proportion of logical values
- logicals <- toupper(characters) == "TRUE" | toupper(characters) == "FALSE"
- are_logical <- round((sum(logicals) / n_rows), 6L)
+ # get logical count
+ logicals <- toupper(x) == "TRUE" | toupper(x) == "FALSE"
+ logical_count <- sum(logicals)
 
- # get the proportion of character values
- are_character <- round((1.0 - (are_na + are_numeric +
-  are_date + are_logical)), 6L)
+ # get the character count
+ character_count <- initial_length -
+ (na_count + logical_count + numeric_count + date_count)
 
- # return the output
- return(
- c(are_na, are_numeric, are_date, are_character, are_logical, are_date_time,
- are_factor)
- )
+ # transform into proportions
+ props <- round(
+ c(na_count, numeric_count, date_count, character_count, logical_count) /
+ initial_length, 4L)
+
+ return(props)
 }
diff --git a/inst/rmarkdown/templates/printing-rmd/skeleton/skeleton.Rmd b/inst/rmarkdown/templates/printing-rmd/skeleton/skeleton.Rmd
@@ -59,7 +59,7 @@ p.compact {
 ```{r cleanepi-source-data, eval=TRUE, echo=FALSE}
 # EXTRACT THE REPORT SECTIONS
 scanning_result <- params[["scanning_result"]]
-is_data_scanned <- !is.null(scanning_result)
+is_data_scanned <- !is.na(scanning_result)
 standardized_column_names <- params[["colnames"]]
 are_column_standardised <- !is.null(standardized_column_names)
 out_of_range_dates <- params[["out_of_range_dates"]]

diff --git a/man/scan_columns.Rd b/man/scan_columns.Rd