From 43874329ce0742526806f67ef97a9412525f1482 Mon Sep 17 00:00:00 2001 From: egillax Date: Thu, 12 Dec 2024 14:55:24 +0100 Subject: [PATCH] add simple imputer for mean and median imputation --- NAMESPACE | 1 + R/Imputation.R | 198 +++++++++++++++++++++++++++++++++---- R/PreprocessingData.R | 5 +- man/createNormalization.Rd | 7 +- man/createSimpleImputer.Rd | 18 ++++ man/simpleImpute.Rd | 21 ++++ 6 files changed, 222 insertions(+), 28 deletions(-) create mode 100644 man/createSimpleImputer.Rd create mode 100644 man/simpleImpute.Rd diff --git a/NAMESPACE b/NAMESPACE index a3a8f7e7..173f8d29 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -36,6 +36,7 @@ export(createRandomForestFeatureSelection) export(createRareFeatureRemover) export(createRestrictPlpDataSettings) export(createSampleSettings) +export(createSimpleImputer) export(createSplineSettings) export(createStratifiedImputationSettings) export(createStudyPopulation) diff --git a/R/Imputation.R b/R/Imputation.R index 227526c1..43c58da4 100644 --- a/R/Imputation.R +++ b/R/Imputation.R @@ -15,6 +15,184 @@ # See the License for the specific language governing permissions and # limitations under the License. +#' @title Create Iterative Imputer settings +#' @description This function creates the settings for an iterative imputer +#' which first removes features with more than `missingThreshold` missing values +#' and then imputes the missing values iteratively using chained equations +#' @param missingThreshold The threshold for missing values to remove a feature +#' @param method The method to use for imputation, currently only "pmm" is supported +#' @return The settings for the single imputer of class `featureEngineeringSettings` +#' @export +createIterativeImputer <- function(missingThreshold = 0.3, + method = "pmm") { + featureEngineeringSettings <- list( + missingThreshold = missingThreshold, + method = method + ) + if (method == "pmm") { + # at the moment this requires glmnet + rlang::check_installed("glmnet") + } + attr(featureEngineeringSettings, "fun") <- "iterativeImpute" + + class(featureEngineeringSettings) <- "featureEngineeringSettings" + return(featureEngineeringSettings) +} + +#' @title Create Simple Imputer settings +#' @description This function creates the settings for a simple imputer +#' which imputes missing values with the mean or median +#' @param method The method to use for imputation, either "mean" or "median" +#' @return The settings for the single imputer of class `featureEngineeringSettings` +#' @export +createSimpleImputer <- function(method = "mean", + missingThreshold = 0.3) { + checkIsClass(method, "character") + checkInStringVector(method, c("mean", "median")) + featureEngineeringSettings <- list( + method = method, + missingThreshold = missingThreshold + ) + attr(featureEngineeringSettings, "fun") <- "simpleImpute" + + class(featureEngineeringSettings) <- "featureEngineeringSettings" + return(featureEngineeringSettings) +} + +#' @title Simple Imputation +#' @description This function does single imputation with the mean or median +#' @param trainData The data to be imputed +#' @param featureEngineeringSettings The settings for the imputation +#' @param done Whether the imputation has already been done (bool) +#' @return The imputed data +simpleImpute <- function(trainData, featureEngineeringSettings, done = FALSE) { + if (!done) { + missingInfo <- extractMissingInfo(trainData) + trainData$covariateData$missingInfo <- missingInfo$missingInfo + continuousFeatures <- missingInfo$continuousFeatures + on.exit(trainData$covariateData$missingInfo <- NULL, add = TRUE) + + trainData$covariateData$covariates <- trainData$covariateData$covariates %>% + dplyr::left_join(trainData$covariateData$missingInfo, by = "covariateId") %>% + dplyr::filter(is.na(.data$missing) || + .data$missing <= featureEngineeringSettings$missingThreshold) %>% + dplyr::select(-"missing") + + # separate the continuous and binary features + featureData <- separateFeatures(trainData, continuousFeatures) + numericData <- featureData[[1]] + on.exit(numericData <- NULL, add = TRUE) + + allRowIds <- numericData$covariates %>% + dplyr::pull(.data$rowId) %>% + unique() %>% + sort() + allColumnIds <- numericData$covariates %>% + dplyr::pull(.data$covariateId) %>% + unique() %>% + sort() + completeIds <- expand.grid(rowId = allRowIds, covariateId = allColumnIds) + numericData$covariates <- merge(completeIds, numericData$covariates, + all.x = TRUE + ) + + if (featureEngineeringSettings$method == "mean") { + numericData$imputedValues <- numericData$covariates %>% + dplyr::group_by(.data$covariateId) %>% + dplyr::summarise(imputedValues = mean(.data$covariateValue, na.rm = TRUE)) + } else if (featureEngineeringSettings$method == "median") { + numericData$imputedValues <- numericData$covariates %>% + dplyr::group_by(.data$covariateId) %>% + dplyr::collect() %>% # median not possible in sql + dplyr::summarise(imputedValues = median(.data$covariateValue, na.rm = TRUE)) + } + + + numericData$imputedCovariates <- numericData$covariates %>% + dplyr::left_join(numericData$imputedValues, by = "covariateId") %>% + dplyr::group_by(.data$covariateId) %>% + dplyr::mutate(imputedValue = ifelse(is.na(.data$covariateValue), + .data$imputedValues, + .data$covariateValue + )) %>% + dplyr::select(-c("imputedValues")) + Andromeda::appendToTable( + trainData$covariateData$covariates, + numericData$imputedCovariates %>% + dplyr::filter(is.na(.data$covariateValue)) %>% + dplyr::mutate(covariateValue = .data$imputedValue) %>% + dplyr::select(-c("imputedValue")) + ) + attr(featureEngineeringSettings, "missingInfo") <- + trainData$covariateData$missingInfo %>% + dplyr::collect() + attr(featureEngineeringSettings, "imputer") <- + numericData$imputedValues %>% dplyr::collect() + done <- TRUE + } else { + trainData$covariateData$missingInfo <- attr( + featureEngineeringSettings, + "missingInfo" + ) + on.exit(trainData$covariateData$missingInfo <- NULL, add = TRUE) + trainData$covariateData$covariates <- trainData$covariateData$covariates %>% + dplyr::left_join(trainData$covariateData$missingInfo, by = "covariateId") %>% + dplyr::filter(is.na(.data$missing) || + .data$missing <= featureEngineeringSettings$missingThreshold) %>% + dplyr::select(-"missing") + + continuousFeatures <- trainData$covariateData$analysisRef %>% + dplyr::filter(.data$isBinary == "N") %>% + dplyr::select("analysisId") %>% + dplyr::inner_join(trainData$covariateData$covariateRef, by = "analysisId") %>% + dplyr::pull(.data$covariateId) + featureData <- separateFeatures(trainData, continuousFeatures) + numericData <- featureData[[1]] + on.exit(numericData <- NULL, add = TRUE) + # impute missing values + allRowIds <- numericData$covariates %>% + dplyr::pull(.data$rowId) %>% + unique() %>% + sort() + allColumnIds <- numericData$covariates %>% + dplyr::pull(.data$covariateId) %>% + unique() %>% + sort() + completeIds <- expand.grid(rowId = allRowIds, covariateId = allColumnIds) + numericData$covariates <- merge(completeIds, numericData$covariates, + all.x = TRUE + ) + numericData$imputedValues <- attr(featureEngineeringSettings, "imputer") + numericData$imputedCovariates <- numericData$covariates %>% + dplyr::left_join(numericData$imputedValues, by = "covariateId") %>% + dplyr::group_by(.data$covariateId) %>% + dplyr::mutate(imputedValue = ifelse(is.na(.data$covariateValue), + .data$imputedValues, + .data$covariateValue + )) %>% + dplyr::select(-c("imputedValues")) + Andromeda::appendToTable( + trainData$covariateData$covariates, + numericData$imputedCovariates %>% + dplyr::filter(is.na(.data$covariateValue)) %>% + dplyr::mutate(covariateValue = .data$imputedValue) %>% + dplyr::select(-c("imputedValue")) + ) + + } + featureEngineering <- list( + funct = "simpleImpute", + settings = list( + featureEngineeringSettings = featureEngineeringSettings, + done = done + ) + ) + attr(trainData, "metaData")$featureEngineering[["simpleImputer"]] <- + featureEngineering + return(trainData) +} + + #' @title Imputation #' @description This function does single imputation with predictive mean matchin #' @param trainData The data to be imputed @@ -194,26 +372,6 @@ iterativeImpute <- function(trainData, featureEngineeringSettings, done = FALSE) return(trainData) } -#' @title Create Iterative Imputer settings -#' @description This function creates the settings for an iterative imputer -#' which first removes features with more than `missingThreshold` missing values -#' and then imputes the missing values iteratively using chained equations -#' @param missingThreshold The threshold for missing values to remove a feature -#' @param method The method to use for imputation, currently only "pmm" is supported -#' @return The settings for the single imputer of class `featureEngineeringSettings` -#' @export -createIterativeImputer <- function(missingThreshold = 0.3, - method = "pmm") { - featureEngineeringSettings <- list( - missingThreshold = missingThreshold, - method = method - ) - attr(featureEngineeringSettings, "fun") <- "iterativeImpute" - - class(featureEngineeringSettings) <- "featureEngineeringSettings" - return(featureEngineeringSettings) -} - #' @title Predictive mean matching using lasso #' @param numericData An andromeda object with the following fields: #' xObs: covariates table for observed data diff --git a/R/PreprocessingData.R b/R/PreprocessingData.R index 8ea17f45..f7e53576 100644 --- a/R/PreprocessingData.R +++ b/R/PreprocessingData.R @@ -239,13 +239,12 @@ robustNormalize <- function(trainData, featureEngineeringSettings, normalized = ) ) - attr(trainData, "metaData")$featureEngineering[['robustNormalize']] <- + attr(trainData, "metaData")$featureEngineering[["robustNormalize"]] <- featureEngineering return(trainData) } -#' Create the settings for normalizing the data -#' @param type The type of normalization to use, either "minmax" or "robust" +#' Create the settings for normalizing the data @param type The type of normalization to use, either "minmax" or "robust" #' @return An object of class \code{featureEngineeringSettings} #' @export createNormalization <- function(type = "minmax") { diff --git a/man/createNormalization.Rd b/man/createNormalization.Rd index fa20c6df..6a73cfb3 100644 --- a/man/createNormalization.Rd +++ b/man/createNormalization.Rd @@ -2,16 +2,13 @@ % Please edit documentation in R/PreprocessingData.R \name{createNormalization} \alias{createNormalization} -\title{Create the settings for normalizing the data} +\title{Create the settings for normalizing the data @param type The type of normalization to use, either "minmax" or "robust"} \usage{ createNormalization(type = "minmax") } -\arguments{ -\item{type}{The type of normalization to use, either "minmax" or "robust"} -} \value{ An object of class \code{featureEngineeringSettings} } \description{ -Create the settings for normalizing the data +Create the settings for normalizing the data @param type The type of normalization to use, either "minmax" or "robust" } diff --git a/man/createSimpleImputer.Rd b/man/createSimpleImputer.Rd new file mode 100644 index 00000000..46b2ef5c --- /dev/null +++ b/man/createSimpleImputer.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Imputation.R +\name{createSimpleImputer} +\alias{createSimpleImputer} +\title{Create Simple Imputer settings} +\usage{ +createSimpleImputer(method = "mean", missingThreshold = 0.3) +} +\arguments{ +\item{method}{The method to use for imputation, either "mean" or "median"} +} +\value{ +The settings for the single imputer of class `featureEngineeringSettings` +} +\description{ +This function creates the settings for a simple imputer +which imputes missing values with the mean or median +} diff --git a/man/simpleImpute.Rd b/man/simpleImpute.Rd new file mode 100644 index 00000000..619cb6bf --- /dev/null +++ b/man/simpleImpute.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Imputation.R +\name{simpleImpute} +\alias{simpleImpute} +\title{Simple Imputation} +\usage{ +simpleImpute(trainData, featureEngineeringSettings, done = FALSE) +} +\arguments{ +\item{trainData}{The data to be imputed} + +\item{featureEngineeringSettings}{The settings for the imputation} + +\item{done}{Whether the imputation has already been done (bool)} +} +\value{ +The imputed data +} +\description{ +This function does single imputation with the mean or median +}