From 272cb8bfe9522da8caa5639776e8911b77eeb90c Mon Sep 17 00:00:00 2001 From: Karim-Mane Date: Thu, 1 Aug 2024 00:38:19 +0000 Subject: [PATCH] update the scan_date() function - only scan through character columns --- R/clean_data_helpers.R | 283 +++++++----------- .../printing-rmd/skeleton/skeleton.Rmd | 2 +- man/scan_columns.Rd | 25 -- man/scan_data.Rd | 41 ++- man/scan_in_character.Rd | 2 +- man/scan_in_double.Rd | 19 -- man/scan_in_integer.Rd | 19 -- man/scan_in_logical.Rd | 19 -- tests/testthat/test-clean_data_helpers.R | 32 +- tests/testthat/test-utils.R | 4 +- 10 files changed, 151 insertions(+), 295 deletions(-) delete mode 100644 man/scan_columns.Rd delete mode 100644 man/scan_in_double.Rd delete mode 100644 man/scan_in_integer.Rd delete mode 100644 man/scan_in_logical.Rd diff --git a/R/clean_data_helpers.R b/R/clean_data_helpers.R index b86a4225..8e0bd5bd 100644 --- a/R/clean_data_helpers.R +++ b/R/clean_data_helpers.R @@ -1,159 +1,64 @@ - -#' Calculate the percentage of missing and other data type values in a vector -#' containing different data types such as numeric, Date, character, -#' logical, date-time, factor. -#' -#' @param x A vector of ones or a combination of various data types. -#' @param type A character with the the vector type. -#' -#' @returns A vector of 5 elements representing the percentage of missing, -#' numeric, date, character, and logical values found in the input vector. -#' -#' @keywords internal -#' -scan_columns <- function(x, type) { - res <- switch(type, - double = scan_in_double(x), - integer = scan_in_integer(x), - logical = scan_in_logical(x), - character = scan_in_character(x)) - return(res) -} - -#' Scan a data frame to determine the percentage of `missing`, `numeric`, -#' `Date`, `character`, `logical`, `date-time`, and `factor` values in every -#' column. +#' Scan through all character columns of a data frame to determine the +#' proportion of `missing`, `numeric`, `Date`, `character`, `logical`, values. #' #' @param data A data frame or linelist #' -#' @returns A data frame or linelist with the same number of rows as the number -#' of columns of the input data, and 8 column representing the field names, -#' the percentage of missing, numeric, date, character, logical, date-time, -#' and factor values in each column. +#' @returns A data frame with the same number of rows as the number of character +#' columns of the input data, and six (06) columns representing the field +#' names, the percentage of missing, numeric, date, character, and logical, +#' values in each column. #' #' @export #' #' @examples +#' # scan through a data frame of characters #' scan_result <- scan_data( #' data = readRDS(system.file("extdata", "messy_data.RDS", #' package = "cleanepi")) #' ) #' -#' @details -#' For columns of type character, the detected numeric values could actually be -#' of type Date or date-time. This is because R coerces some Date values into -#' numeric when the date is imported from an MS Excel file. +#' # scan through a data frame with two character columns +#' scan_result <- scan_data( +#' data = readRDS(system.file("extdata", "test_linelist.RDS", +#' package = "cleanepi")) +#' ) +#' +#' # scan through a data frame with no character columns +#' data(iris) +#' iris[["fct"]] <- as.factor(sample(c("gray", "orange"), nrow(iris), +#' replace = TRUE)) +#' iris[["lgl"]] <- sample(c(TRUE, FALSE), nrow(iris), replace = TRUE) +#' iris[["date"]] <- as.Date(seq.Date(from = as.Date("2024-01-01"), +#' to = as.Date("2024-08-30"), +#' length.out = nrow(iris))) +#' iris[["posit_ct"]] <- as.POSIXct(iris[["date"]]) +#' scan_result <- scan_data(data = iris) #' scan_data <- function(data) { - types <- vapply(data, typeof, character(1L)) + # scan through all columns of the data and the identify character columns + types <- vapply(data, typeof, character(1L)) + target_columns <- types[types == "character"] + + # send an message if there is no character column found within the input data + if (length(target_columns) == 0L) { + message("No character column found in the provided data.") + return(invisible(NA)) + } + + # scan through the character columns + data <- data[, names(target_columns)] scan_result <- vapply(seq_len(ncol(data)), function(col_index) { - scan_columns(data[[col_index]], types[[col_index]]) - }, numeric(7L)) + scan_in_character(data[[col_index]]) + }, numeric(5L)) scan_result <- as.data.frame(t(scan_result)) - names(scan_result) <- c("missing", "numeric", "date", "character", - "logical", "date-time", "factor") + names(scan_result) <- c("missing", "numeric", "date", "character", "logical") scan_result <- cbind(Field_names = names(data), scan_result) return(scan_result) } -#' Scan through a double column -#' -#' @param x The input vector -#' -#' @return A numeric vector with the proportion of the different types of data -#' that were detected within the input vector. -#' @keywords internal -#' -scan_in_double <- function(x) { - are_factor <- are_date <- are_date_time <- are_character <- are_numeric <- - are_logical <- are_na <- 0.0 - # save the variable length - n_rows <- length(x) - - # get the proportion of NA - are_na <- round((sum(is.na(x)) / n_rows), 6L) - x <- x[!is.na(x)] - - # doubles are either numeric (attributes = NULL), or Date (has a 'class' - # attributes = Date), or date-time (has a 'class' attributes = POSIXt) - if ("class" %in% names(attributes(x))) { - if ("Date" %in% attributes(x)[["class"]]) { - are_date <- round((length(x) / n_rows), 6L) - } else if ("POSIXt" %in% attributes(x)[["class"]]) { - are_date_time <- round((length(x) / n_rows), 6L) - } - } else { - are_numeric <- round((length(x) / n_rows), 6L) - } - return( - c(are_na, are_numeric, are_date, are_character, are_logical, are_date_time, - are_factor) - ) -} - -#' Scan through an integer column -#' -#' @param x The input vector -#' -#' @return A numeric vector with the proportion of the different types of data -#' that were detected within the input vector. -#' @keywords internal -#' -scan_in_integer <- function(x) { - are_factor <- are_date <- are_date_time <- are_character <- are_numeric <- - are_logical <- are_na <- 0.0 - # save the variable length - n_rows <- length(x) - - # get the proportion of NA - are_na <- round((sum(is.na(x)) / n_rows), 6L) - x <- x[!is.na(x)] - - # integers are either numeric (attributes = NULL), or factors (has a 'class' - # and 'levels' attributes) - if (is.null(attributes(x))) { - are_numeric <- round((length(x) / n_rows), 6L) - } else if (identical(names(attributes(x)), c("levels", "class"))) { - are_factor <- round((length(x) / n_rows), 6L) - } - - return( - c(are_na, are_numeric, are_date, are_character, are_logical, are_date_time, - are_factor) - ) -} - -#' Scan through a logical column -#' -#' @param x The input vector -#' -#' @return A numeric vector with the proportion of the different types of data -#' that were detected within the input vector. -#' @keywords internal -#' -scan_in_logical <- function(x) { - are_factor <- are_date <- are_date_time <- are_character <- are_numeric <- - are_logical <- are_na <- 0.0 - - # logical are simply logical. We will only determine the %NA and %logical - # save the variable length - n_rows <- length(x) - - # get the proportion of NA - are_na <- round((sum(is.na(x)) / n_rows), 6L) - x <- x[!is.na(x)] - - # get the proportion of logical - are_logical <- round((length(x) / n_rows), 6L) - return( - c(are_na, are_numeric, are_date, are_character, are_logical, are_date_time, - are_factor) - ) -} - #' Scan through a character column #' -#' @param x The input vector +#' @param x The input character vector #' #' @return A numeric vector with the proportion of the different types of data #' that were detected within the input vector. @@ -161,62 +66,82 @@ scan_in_logical <- function(x) { #' scan_in_character <- function(x) { # There might be, within a character column, values of type: - # character, numeric, date, date-time, NA, and logical + # character, numeric, date (date or date-time), NA, and logical # In this function, we check the presence of these different types within a # character column. - # Note that numeric values can actually be of 'Date' or 'date-time' type. - # Given that any numeric can be converted into Date, we will not check for - # Date or date-time values within the numeric. - - are_factor <- are_date <- are_date_time <- are_character <- are_numeric <- - are_logical <- are_na <- 0.0 # save the variable length - n_rows <- length(x) + initial_length <- length(x) - # get the proportion of NA - are_na <- round((sum(is.na(x)) / n_rows), 6L) + # get the count of missing data (NA) + na_count <- sum(is.na(x)) x <- x[!is.na(x)] - # get double values and evaluate the proportion numeric values - doubles <- x[!is.na(suppressWarnings(as.double(x)))] - if (length(doubles) > 0L) { - are_numeric <- round((length(doubles) / n_rows), 6L) - } - - # get character values and check for the presence of Date and date-time - characters <- x[is.na(suppressWarnings(as.double(x)))] - if (length(characters) > 0L && - !is.null(lubridate::guess_formats(characters, - c("ymd", "ydm", "dmy", "mdy", "myd", - "dym", "Ymd", "Ydm", "dmY", "mdY", - "mYd", "dYm")))) { - # get the proportion of date values - tmp <- suppressWarnings( - as.Date( - lubridate::parse_date_time( - characters, - orders = c("ymd", "ydm", "dmy", "mdy", "myd", "dym", "Ymd", "Ydm", - "dmY", "mdY", "mYd", "dYm") - ) + # We will check if there is any Date values within the variable by parsing the + # values, looking for the ones that fit any of the predefined format. + # When there is one or more Date values, we will convert the variable into + # numeric and determine if any of them is a Date (a numeric, which after + # conversion to Date, fall within the interval + # [50 years back from today's date, today's date]). That way the Date count is + # the count of date identified from the parsing + the count of Dates within + # the numeric values. + # When there is no Date values identified from the parsing, the variable + # is converted into numeric. The numeric count is the sum of numeric values. + # The logical count is the number of TRUE and FALSE written in both lower + # and upper case within the variable + # The remaining values will be of type character. + + # parsing the vector, looking for date values + tmp <- suppressWarnings( + as.Date( + lubridate::parse_date_time( + x, + orders = c("ymd", "ydm", "dmy", "mdy", "myd", "dym", "Ymd", "Ydm", + "dmY", "mdY", "mYd", "dYm") ) ) - are_date <- round((sum(!is.na(tmp)) / n_rows), 6L) - x <- x[is.na(tmp)] - characters <- characters[is.na(tmp)] + ) + + # getting the date and numeric count as describe above + date_count <- numeric_count <- 0L + if (sum(!is.na(tmp)) > 0L) { + # Setting the first date to 50 years before the current date + target_interval <- sort( + seq.Date(Sys.Date(), length.out = 2L, by = "-50 years") + ) + + # get the date count + date_count <- date_count + sum(!is.na(tmp)) + + # convert to numeric and check for the presence of Date among the numeric + tmp2 <- x[is.na(tmp)] + tmp3 <- suppressWarnings(as.numeric(tmp2)) + if (sum(!is.na(tmp3)) > 0L) { + y <- lubridate::as_date( + tmp3[!is.na(tmp3)], + origin = target_interval[[1L]] + ) + # second count of date values coming from date within numeric + date_count <- date_count + sum(!is.na(y)) + numeric_count <- sum(is.na(y)) + } + } else { + tmp <- suppressWarnings(as.numeric(x)) + numeric_count <- sum(!is.na(tmp)) } - # get the proportion of logical values - logicals <- toupper(characters) == "TRUE" | toupper(characters) == "FALSE" - are_logical <- round((sum(logicals) / n_rows), 6L) + # get logical count + logicals <- toupper(x) == "TRUE" | toupper(x) == "FALSE" + logical_count <- sum(logicals) - # get the proportion of character values - are_character <- round((1.0 - (are_na + are_numeric + - are_date + are_logical)), 6L) + # get the character count + character_count <- initial_length - + (na_count + logical_count + numeric_count + date_count) - # return the output - return( - c(are_na, are_numeric, are_date, are_character, are_logical, are_date_time, - are_factor) - ) + # transform into proportions + props <- round( + c(na_count, numeric_count, date_count, character_count, logical_count) / + initial_length, 4L) + + return(props) } diff --git a/inst/rmarkdown/templates/printing-rmd/skeleton/skeleton.Rmd b/inst/rmarkdown/templates/printing-rmd/skeleton/skeleton.Rmd index 837d556c..e13e407b 100644 --- a/inst/rmarkdown/templates/printing-rmd/skeleton/skeleton.Rmd +++ b/inst/rmarkdown/templates/printing-rmd/skeleton/skeleton.Rmd @@ -59,7 +59,7 @@ p.compact { ```{r cleanepi-source-data, eval=TRUE, echo=FALSE} # EXTRACT THE REPORT SECTIONS scanning_result <- params[["scanning_result"]] -is_data_scanned <- !is.null(scanning_result) +is_data_scanned <- !is.na(scanning_result) standardized_column_names <- params[["colnames"]] are_column_standardised <- !is.null(standardized_column_names) out_of_range_dates <- params[["out_of_range_dates"]] diff --git a/man/scan_columns.Rd b/man/scan_columns.Rd deleted file mode 100644 index b359875d..00000000 --- a/man/scan_columns.Rd +++ /dev/null @@ -1,25 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/clean_data_helpers.R -\name{scan_columns} -\alias{scan_columns} -\title{Calculate the percentage of missing and other data type values in a vector -containing different data types such as numeric, Date, character, -logical, date-time, factor.} -\usage{ -scan_columns(x, type) -} -\arguments{ -\item{x}{A vector of ones or a combination of various data types.} - -\item{type}{A character with the the vector type.} -} -\value{ -A vector of 5 elements representing the percentage of missing, -numeric, date, character, and logical values found in the input vector. -} -\description{ -Calculate the percentage of missing and other data type values in a vector -containing different data types such as numeric, Date, character, -logical, date-time, factor. -} -\keyword{internal} diff --git a/man/scan_data.Rd b/man/scan_data.Rd index cd6b62dd..90a2ee21 100644 --- a/man/scan_data.Rd +++ b/man/scan_data.Rd @@ -2,9 +2,8 @@ % Please edit documentation in R/clean_data_helpers.R \name{scan_data} \alias{scan_data} -\title{Scan a data frame to determine the percentage of \code{missing}, \code{numeric}, -\code{Date}, \code{character}, \code{logical}, \code{date-time}, and \code{factor} values in every -column.} +\title{Scan through all character columns of a data frame to determine the +proportion of \code{missing}, \code{numeric}, \code{Date}, \code{character}, \code{logical}, values.} \usage{ scan_data(data) } @@ -12,25 +11,37 @@ scan_data(data) \item{data}{A data frame or linelist} } \value{ -A data frame or linelist with the same number of rows as the number -of columns of the input data, and 8 column representing the field names, -the percentage of missing, numeric, date, character, logical, date-time, -and factor values in each column. +A data frame with the same number of rows as the number of character +columns of the input data, and six (06) columns representing the field +names, the percentage of missing, numeric, date, character, and logical, +values in each column. } \description{ -Scan a data frame to determine the percentage of \code{missing}, \code{numeric}, -\code{Date}, \code{character}, \code{logical}, \code{date-time}, and \code{factor} values in every -column. -} -\details{ -For columns of type character, the detected numeric values could actually be -of type Date or date-time. This is because R coerces some Date values into -numeric when the date is imported from an MS Excel file. +Scan through all character columns of a data frame to determine the +proportion of \code{missing}, \code{numeric}, \code{Date}, \code{character}, \code{logical}, values. } \examples{ +# scan through a data frame of characters scan_result <- scan_data( data = readRDS(system.file("extdata", "messy_data.RDS", package = "cleanepi")) ) +# scan through a data frame with two character columns +scan_result <- scan_data( + data = readRDS(system.file("extdata", "test_linelist.RDS", + package = "cleanepi")) + ) + +# scan through a data frame with no character columns +data(iris) +iris[["fct"]] <- as.factor(sample(c("gray", "orange"), nrow(iris), + replace = TRUE)) +iris[["lgl"]] <- sample(c(TRUE, FALSE), nrow(iris), replace = TRUE) +iris[["date"]] <- as.Date(seq.Date(from = as.Date("2024-01-01"), + to = as.Date("2024-08-30"), + length.out = nrow(iris))) +iris[["posit_ct"]] <- as.POSIXct(iris[["date"]]) +scan_result <- scan_data(data = iris) + } diff --git a/man/scan_in_character.Rd b/man/scan_in_character.Rd index e1e912fc..83559951 100644 --- a/man/scan_in_character.Rd +++ b/man/scan_in_character.Rd @@ -7,7 +7,7 @@ scan_in_character(x) } \arguments{ -\item{x}{The input vector} +\item{x}{The input character vector} } \value{ A numeric vector with the proportion of the different types of data diff --git a/man/scan_in_double.Rd b/man/scan_in_double.Rd deleted file mode 100644 index 3c277218..00000000 --- a/man/scan_in_double.Rd +++ /dev/null @@ -1,19 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/clean_data_helpers.R -\name{scan_in_double} -\alias{scan_in_double} -\title{Scan through a double column} -\usage{ -scan_in_double(x) -} -\arguments{ -\item{x}{The input vector} -} -\value{ -A numeric vector with the proportion of the different types of data -that were detected within the input vector. -} -\description{ -Scan through a double column -} -\keyword{internal} diff --git a/man/scan_in_integer.Rd b/man/scan_in_integer.Rd deleted file mode 100644 index d807eed6..00000000 --- a/man/scan_in_integer.Rd +++ /dev/null @@ -1,19 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/clean_data_helpers.R -\name{scan_in_integer} -\alias{scan_in_integer} -\title{Scan through an integer column} -\usage{ -scan_in_integer(x) -} -\arguments{ -\item{x}{The input vector} -} -\value{ -A numeric vector with the proportion of the different types of data -that were detected within the input vector. -} -\description{ -Scan through an integer column -} -\keyword{internal} diff --git a/man/scan_in_logical.Rd b/man/scan_in_logical.Rd deleted file mode 100644 index 61f22803..00000000 --- a/man/scan_in_logical.Rd +++ /dev/null @@ -1,19 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/clean_data_helpers.R -\name{scan_in_logical} -\alias{scan_in_logical} -\title{Scan through a logical column} -\usage{ -scan_in_logical(x) -} -\arguments{ -\item{x}{The input vector} -} -\value{ -A numeric vector with the proportion of the different types of data -that were detected within the input vector. -} -\description{ -Scan through a logical column -} -\keyword{internal} diff --git a/tests/testthat/test-clean_data_helpers.R b/tests/testthat/test-clean_data_helpers.R index d504f0f7..880b2c9d 100644 --- a/tests/testthat/test-clean_data_helpers.R +++ b/tests/testthat/test-clean_data_helpers.R @@ -1,15 +1,15 @@ test_that("scan_data works as expected", { - dat <- readRDS(system.file("extdata", "messy_data.RDS", - package = "cleanepi")) + # using a dataset with character columns only + dat <- readRDS(system.file("extdata", "messy_data.RDS", package = "cleanepi")) scan_result <- scan_data(data = dat) expect_s3_class(scan_result, "data.frame") expect_named(scan_result, c("Field_names", "missing", "numeric", "date", - "character", "logical", "date-time", "factor")) - expect_identical(ncol(scan_result), 8L) + "character", "logical")) + expect_identical(ncol(scan_result), 6L) expect_identical(nrow(scan_result), ncol(dat)) expect_identical(scan_result[["Field_names"]], names(dat)) - # using a dataset with many data types + # using a dataset with no character column data(iris) iris[["fct"]] <- as.factor(sample(c("gray", "orange"), nrow(iris), replace = TRUE)) @@ -19,14 +19,16 @@ test_that("scan_data works as expected", { length.out = nrow(iris))) iris[["posit_ct"]] <- as.POSIXct(iris[["date"]]) scan_result <- scan_data(data = iris) - expect_identical(ncol(scan_result), 8L) - expect_identical(nrow(scan_result), ncol(iris)) - expect_identical(scan_result[["Field_names"]], names(iris)) - expect_identical(sum(scan_result[["numeric"]]), 4) - expect_identical(sum(scan_result[["missing"]]), 0) - expect_identical(sum(scan_result[["date"]]), 1) - expect_identical(sum(scan_result[["character"]]), 0) - expect_identical(sum(scan_result[["logical"]]), 1) - expect_identical(sum(scan_result[["date-time"]]), 1) - expect_identical(sum(scan_result[["factor"]]), 2) + expect_identical(scan_result, NA) + expect_message(scan_data(data = iris), + "No character column found in the provided data.") + + # using a data with some character columns + dat <- readRDS(system.file("extdata", "test_linelist.RDS", + package = "cleanepi")) + scan_result <- suppressWarnings(scan_data(data = dat)) + expect_identical(ncol(scan_result), 6L) + expect_identical(nrow(scan_result), 2L) + expect_false(nrow(scan_result) == ncol(dat)) + expect_identical(scan_result[["Field_names"]], c("id", "age_class")) }) diff --git a/tests/testthat/test-utils.R b/tests/testthat/test-utils.R index 1f188491..dafb13f5 100644 --- a/tests/testthat/test-utils.R +++ b/tests/testthat/test-utils.R @@ -112,8 +112,8 @@ test_that("add_to_report works as expected", { expect_named(report, "scanning_result") expect_named(report[["scanning_result"]], c("Field_names", "missing", "numeric", "date", "character", - "logical", "date-time", "factor")) - expect_identical(nrow(report[["scanning_result"]]), ncol(data)) + "logical")) + expect_identical(nrow(report[["scanning_result"]]), 6L) })