Skip to content

Commit

Permalink
update the scan_date() function - only scan through character columns
Browse files Browse the repository at this point in the history
  • Loading branch information
Karim-Mane committed Aug 1, 2024
1 parent 0d11788 commit 272cb8b
Show file tree
Hide file tree
Showing 10 changed files with 151 additions and 295 deletions.
283 changes: 104 additions & 179 deletions R/clean_data_helpers.R
Original file line number Diff line number Diff line change
@@ -1,222 +1,147 @@

#' Calculate the percentage of missing and other data type values in a vector
#' containing different data types such as numeric, Date, character,
#' logical, date-time, factor.
#'
#' @param x A vector of ones or a combination of various data types.
#' @param type A character with the the vector type.
#'
#' @returns A vector of 5 elements representing the percentage of missing,
#' numeric, date, character, and logical values found in the input vector.
#'
#' @keywords internal
#'
scan_columns <- function(x, type) {
res <- switch(type,
double = scan_in_double(x),
integer = scan_in_integer(x),
logical = scan_in_logical(x),
character = scan_in_character(x))
return(res)
}

#' Scan a data frame to determine the percentage of `missing`, `numeric`,
#' `Date`, `character`, `logical`, `date-time`, and `factor` values in every
#' column.
#' Scan through all character columns of a data frame to determine the
#' proportion of `missing`, `numeric`, `Date`, `character`, `logical`, values.
#'
#' @param data A data frame or linelist
#'
#' @returns A data frame or linelist with the same number of rows as the number
#' of columns of the input data, and 8 column representing the field names,
#' the percentage of missing, numeric, date, character, logical, date-time,
#' and factor values in each column.
#' @returns A data frame with the same number of rows as the number of character
#' columns of the input data, and six (06) columns representing the field
#' names, the percentage of missing, numeric, date, character, and logical,
#' values in each column.
#'
#' @export
#'
#' @examples
#' # scan through a data frame of characters
#' scan_result <- scan_data(
#' data = readRDS(system.file("extdata", "messy_data.RDS",
#' package = "cleanepi"))
#' )
#'
#' @details
#' For columns of type character, the detected numeric values could actually be
#' of type Date or date-time. This is because R coerces some Date values into
#' numeric when the date is imported from an MS Excel file.
#' # scan through a data frame with two character columns
#' scan_result <- scan_data(
#' data = readRDS(system.file("extdata", "test_linelist.RDS",
#' package = "cleanepi"))
#' )
#'
#' # scan through a data frame with no character columns
#' data(iris)
#' iris[["fct"]] <- as.factor(sample(c("gray", "orange"), nrow(iris),
#' replace = TRUE))
#' iris[["lgl"]] <- sample(c(TRUE, FALSE), nrow(iris), replace = TRUE)
#' iris[["date"]] <- as.Date(seq.Date(from = as.Date("2024-01-01"),
#' to = as.Date("2024-08-30"),
#' length.out = nrow(iris)))
#' iris[["posit_ct"]] <- as.POSIXct(iris[["date"]])
#' scan_result <- scan_data(data = iris)
#'
scan_data <- function(data) {
types <- vapply(data, typeof, character(1L))
# scan through all columns of the data and the identify character columns
types <- vapply(data, typeof, character(1L))
target_columns <- types[types == "character"]

# send an message if there is no character column found within the input data
if (length(target_columns) == 0L) {
message("No character column found in the provided data.")
return(invisible(NA))
}

# scan through the character columns
data <- data[, names(target_columns)]
scan_result <- vapply(seq_len(ncol(data)), function(col_index) {
scan_columns(data[[col_index]], types[[col_index]])
}, numeric(7L))
scan_in_character(data[[col_index]])
}, numeric(5L))
scan_result <- as.data.frame(t(scan_result))
names(scan_result) <- c("missing", "numeric", "date", "character",
"logical", "date-time", "factor")
names(scan_result) <- c("missing", "numeric", "date", "character", "logical")
scan_result <- cbind(Field_names = names(data), scan_result)
return(scan_result)
}

#' Scan through a double column
#'
#' @param x The input vector
#'
#' @return A numeric vector with the proportion of the different types of data
#' that were detected within the input vector.
#' @keywords internal
#'
scan_in_double <- function(x) {
are_factor <- are_date <- are_date_time <- are_character <- are_numeric <-
are_logical <- are_na <- 0.0
# save the variable length
n_rows <- length(x)

# get the proportion of NA
are_na <- round((sum(is.na(x)) / n_rows), 6L)
x <- x[!is.na(x)]

# doubles are either numeric (attributes = NULL), or Date (has a 'class'
# attributes = Date), or date-time (has a 'class' attributes = POSIXt)
if ("class" %in% names(attributes(x))) {
if ("Date" %in% attributes(x)[["class"]]) {
are_date <- round((length(x) / n_rows), 6L)
} else if ("POSIXt" %in% attributes(x)[["class"]]) {
are_date_time <- round((length(x) / n_rows), 6L)
}
} else {
are_numeric <- round((length(x) / n_rows), 6L)
}
return(
c(are_na, are_numeric, are_date, are_character, are_logical, are_date_time,
are_factor)
)
}

#' Scan through an integer column
#'
#' @param x The input vector
#'
#' @return A numeric vector with the proportion of the different types of data
#' that were detected within the input vector.
#' @keywords internal
#'
scan_in_integer <- function(x) {
are_factor <- are_date <- are_date_time <- are_character <- are_numeric <-
are_logical <- are_na <- 0.0
# save the variable length
n_rows <- length(x)

# get the proportion of NA
are_na <- round((sum(is.na(x)) / n_rows), 6L)
x <- x[!is.na(x)]

# integers are either numeric (attributes = NULL), or factors (has a 'class'
# and 'levels' attributes)
if (is.null(attributes(x))) {
are_numeric <- round((length(x) / n_rows), 6L)
} else if (identical(names(attributes(x)), c("levels", "class"))) {
are_factor <- round((length(x) / n_rows), 6L)
}

return(
c(are_na, are_numeric, are_date, are_character, are_logical, are_date_time,
are_factor)
)
}

#' Scan through a logical column
#'
#' @param x The input vector
#'
#' @return A numeric vector with the proportion of the different types of data
#' that were detected within the input vector.
#' @keywords internal
#'
scan_in_logical <- function(x) {
are_factor <- are_date <- are_date_time <- are_character <- are_numeric <-
are_logical <- are_na <- 0.0

# logical are simply logical. We will only determine the %NA and %logical
# save the variable length
n_rows <- length(x)

# get the proportion of NA
are_na <- round((sum(is.na(x)) / n_rows), 6L)
x <- x[!is.na(x)]

# get the proportion of logical
are_logical <- round((length(x) / n_rows), 6L)
return(
c(are_na, are_numeric, are_date, are_character, are_logical, are_date_time,
are_factor)
)
}

#' Scan through a character column
#'
#' @param x The input vector
#' @param x The input character vector
#'
#' @return A numeric vector with the proportion of the different types of data
#' that were detected within the input vector.
#' @keywords internal
#'
scan_in_character <- function(x) {
# There might be, within a character column, values of type:
# character, numeric, date, date-time, NA, and logical
# character, numeric, date (date or date-time), NA, and logical
# In this function, we check the presence of these different types within a
# character column.
# Note that numeric values can actually be of 'Date' or 'date-time' type.
# Given that any numeric can be converted into Date, we will not check for
# Date or date-time values within the numeric.

are_factor <- are_date <- are_date_time <- are_character <- are_numeric <-
are_logical <- are_na <- 0.0

# save the variable length
n_rows <- length(x)
initial_length <- length(x)

# get the proportion of NA
are_na <- round((sum(is.na(x)) / n_rows), 6L)
# get the count of missing data (NA)
na_count <- sum(is.na(x))
x <- x[!is.na(x)]

# get double values and evaluate the proportion numeric values
doubles <- x[!is.na(suppressWarnings(as.double(x)))]
if (length(doubles) > 0L) {
are_numeric <- round((length(doubles) / n_rows), 6L)
}

# get character values and check for the presence of Date and date-time
characters <- x[is.na(suppressWarnings(as.double(x)))]
if (length(characters) > 0L &&
!is.null(lubridate::guess_formats(characters,
c("ymd", "ydm", "dmy", "mdy", "myd",
"dym", "Ymd", "Ydm", "dmY", "mdY",
"mYd", "dYm")))) {
# get the proportion of date values
tmp <- suppressWarnings(
as.Date(
lubridate::parse_date_time(
characters,
orders = c("ymd", "ydm", "dmy", "mdy", "myd", "dym", "Ymd", "Ydm",
"dmY", "mdY", "mYd", "dYm")
)
# We will check if there is any Date values within the variable by parsing the
# values, looking for the ones that fit any of the predefined format.
# When there is one or more Date values, we will convert the variable into
# numeric and determine if any of them is a Date (a numeric, which after
# conversion to Date, fall within the interval
# [50 years back from today's date, today's date]). That way the Date count is
# the count of date identified from the parsing + the count of Dates within
# the numeric values.
# When there is no Date values identified from the parsing, the variable
# is converted into numeric. The numeric count is the sum of numeric values.
# The logical count is the number of TRUE and FALSE written in both lower
# and upper case within the variable
# The remaining values will be of type character.

# parsing the vector, looking for date values
tmp <- suppressWarnings(
as.Date(
lubridate::parse_date_time(
x,
orders = c("ymd", "ydm", "dmy", "mdy", "myd", "dym", "Ymd", "Ydm",
"dmY", "mdY", "mYd", "dYm")
)
)
are_date <- round((sum(!is.na(tmp)) / n_rows), 6L)
x <- x[is.na(tmp)]
characters <- characters[is.na(tmp)]
)

# getting the date and numeric count as describe above
date_count <- numeric_count <- 0L
if (sum(!is.na(tmp)) > 0L) {
# Setting the first date to 50 years before the current date
target_interval <- sort(
seq.Date(Sys.Date(), length.out = 2L, by = "-50 years")
)

# get the date count
date_count <- date_count + sum(!is.na(tmp))

# convert to numeric and check for the presence of Date among the numeric
tmp2 <- x[is.na(tmp)]
tmp3 <- suppressWarnings(as.numeric(tmp2))
if (sum(!is.na(tmp3)) > 0L) {
y <- lubridate::as_date(
tmp3[!is.na(tmp3)],
origin = target_interval[[1L]]
)
# second count of date values coming from date within numeric
date_count <- date_count + sum(!is.na(y))
numeric_count <- sum(is.na(y))
}
} else {
tmp <- suppressWarnings(as.numeric(x))
numeric_count <- sum(!is.na(tmp))
}

# get the proportion of logical values
logicals <- toupper(characters) == "TRUE" | toupper(characters) == "FALSE"
are_logical <- round((sum(logicals) / n_rows), 6L)
# get logical count
logicals <- toupper(x) == "TRUE" | toupper(x) == "FALSE"
logical_count <- sum(logicals)

# get the proportion of character values
are_character <- round((1.0 - (are_na + are_numeric +
are_date + are_logical)), 6L)
# get the character count
character_count <- initial_length -
(na_count + logical_count + numeric_count + date_count)

# return the output
return(
c(are_na, are_numeric, are_date, are_character, are_logical, are_date_time,
are_factor)
)
# transform into proportions
props <- round(
c(na_count, numeric_count, date_count, character_count, logical_count) /
initial_length, 4L)

return(props)
}
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ p.compact {
```{r cleanepi-source-data, eval=TRUE, echo=FALSE}
# EXTRACT THE REPORT SECTIONS
scanning_result <- params[["scanning_result"]]
is_data_scanned <- !is.null(scanning_result)
is_data_scanned <- !is.na(scanning_result)
standardized_column_names <- params[["colnames"]]
are_column_standardised <- !is.null(standardized_column_names)
out_of_range_dates <- params[["out_of_range_dates"]]
Expand Down
25 changes: 0 additions & 25 deletions man/scan_columns.Rd

This file was deleted.

Loading

0 comments on commit 272cb8b

Please sign in to comment.