diff --git a/DESCRIPTION b/DESCRIPTION
index 1065bba9..8a6713d4 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -41,6 +41,7 @@ Imports:
     utils
 Suggests:
     Eunomia (>= 2.0.0),
     ggplot2,
+    glmnet,
     gridExtra,
     IterativeHardThresholding,
diff --git a/NAMESPACE b/NAMESPACE
index c2f7f620..cdb0bef8 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -25,14 +25,18 @@ export(createExecuteSettings)
 export(createExistingSplitSettings)
 export(createFeatureEngineeringSettings)
 export(createGlmModel)
+export(createIterativeImputer)
 export(createLearningCurve)
 export(createLogSettings)
 export(createModelDesign)
+export(createNormalizer)
 export(createPlpResultTables)
 export(createPreprocessSettings)
 export(createRandomForestFeatureSelection)
+export(createRareFeatureRemover)
 export(createRestrictPlpDataSettings)
 export(createSampleSettings)
+export(createSimpleImputer)
 export(createSplineSettings)
 export(createStratifiedImputationSettings)
 export(createStudyPopulation)
diff --git a/R/FeatureEngineering.R b/R/FeatureEngineering.R
index 1e15f447..f61b40e2 100644
--- a/R/FeatureEngineering.R
+++ b/R/FeatureEngineering.R
@@ -82,11 +82,14 @@ createUnivariateFeatureSelection <- function(k = 100) {
     "reticulate",
     reason = "This function requires the reticulate package to be installed"
   )
-  tryCatch({
-    reticulate::import("sklearn")
-  }, error = function(e) {
-    stop("This function requires the scikit-learn package to be installed")
-  })
+  tryCatch(
+    {
+      reticulate::import("sklearn")
+    },
+    error = function(e) {
+      stop("This function requires the scikit-learn package to be installed")
+    }
+  )
   checkIsClass(k, "integer")
   checkHigherEqual(k, 0)
@@ -115,11 +118,14 @@ createRandomForestFeatureSelection <- function(ntrees = 2000, maxDepth = 17) {
     "reticulate",
     reason = "This function requires the reticulate package to be installed"
   )
-  tryCatch({
-    reticulate::import("sklearn")
-  }, error = function(e) {
-    stop("This function requires the scikit-learn package to be installed")
-  })
+  tryCatch(
+    {
+      reticulate::import("sklearn")
+    },
+    error = function(e) {
+      stop("This function requires the scikit-learn package to be installed")
+    }
+  )
   checkIsClass(ntrees, c("numeric", "integer"))
   checkIsClass(maxDepth, c("numeric", "integer"))
   checkHigher(ntrees, 0)
@@ -202,8 +208,8 @@ splineCovariates <- function(
   )
 
   # add the feature engineering in
-  attr(trainData, "metaData")$featureEngineering <- listAppend(
-    attr(trainData, "metaData")$featureEngineering,
+  attr(trainData$covariateData, "metaData")$featureEngineering <- listAppend(
+    attr(trainData$covariateData, "metaData")$featureEngineering,
     featureEngineering
   )
   ParallelLogger::logInfo("Finished splineCovariates")
@@ -223,7 +229,7 @@ splineMap <- function(
     as.data.frame()
 
   designMatrix <- splines::bs(
-    x = measurements$covariateValue, 
+    x = measurements$covariateValue,
     knots = knots[2:(length(knots) - 1)],
     Boundary.knots = knots[c(1, length(knots))]
   )
@@ -259,7 +265,8 @@ splineMap <- function(
   Andromeda::appendToTable(
     tbl = data$covariateData$covariateRef,
     data = data.frame(
-      covariateId = covariateId * 10000 + (1:(ncol(designMatrix))) * 1000 + analysisId,
+      covariateId =
+        covariateId * 10000 + (1:(ncol(designMatrix))) * 1000 + analysisId,
       covariateName = paste(
         paste0(covariateName, " spline component "),
         1:ncol(designMatrix)
@@ -455,8 +462,8 @@ univariateFeatureSelection <- function(
     )
   )
 
-  attr(trainData, "metaData")$featureEngineering <- listAppend(
-    attr(trainData, "metaData")$featureEngineering,
+  attr(trainData$covariateData, "metaData")$featureEngineering <- listAppend(
+    attr(trainData$covariateData, "metaData")$featureEngineering,
     featureEngineering
   )
 
@@ -516,10 +523,318 @@ randomForestFeatureSelection <- function(
     )
   )
 
-  attr(trainData, "metaData")$featureEngineering <- listAppend(
-    attr(trainData, "metaData")$featureEngineering,
+  attr(trainData$covariateData, "metaData")$featureEngineering <- listAppend(
+    attr(trainData$covariateData, "metaData")$featureEngineering,
     featureEngeering
   )
 
   return(trainData)
 }
+
+#' Create the settings for normalizing the data
+#' @param type The type of normalization to use, either "minmax" or "robust"
+#' @return An object of class \code{featureEngineeringSettings}
+#' @export
+createNormalizer <- function(type = "minmax") {
+  featureEngineeringSettings <- list(
+    type = type
+  )
+  checkIsClass(type, "character")
+  checkInStringVector(type, c("minmax", "robust"))
+  if (type == "minmax") {
+    attr(featureEngineeringSettings, "fun") <- "minMaxNormalize"
+  } else if (type == "robust") {
+    attr(featureEngineeringSettings, "fun") <- "robustNormalize"
+  }
+
+  class(featureEngineeringSettings) <- "featureEngineeringSettings"
+  return(featureEngineeringSettings)
+}
+
+#' A function that normalizes continuous features to have values between 0 and 1
+#' @details uses (value - min) / (max - min) to normalize the data
+#' @param trainData The training data to be normalized
+#' @param featureEngineeringSettings The settings for the normalization
+#' @param done Whether the data has already been normalized (bool)
+#' @return The normalized data
+#' @keywords internal
+minMaxNormalize <- function(trainData, featureEngineeringSettings, done = FALSE) {
+  start <- Sys.time()
+  if (!done) {
+    outData <- list(
+      labels = trainData$labels,
+      folds = trainData$folds,
+      covariateData = Andromeda::copyAndromeda(trainData$covariateData)
+    )
+    ParallelLogger::logInfo("Starting min-max normalization of continuous features")
+    # fit the normalization
+    # find continuous features from trainData$covariateData$analysisRef
+    continuousFeatures <- outData$covariateData$analysisRef %>%
+      dplyr::filter(.data$isBinary == "N") %>%
+      dplyr::select("analysisId") %>%
+      dplyr::inner_join(outData$covariateData$covariateRef, by = "analysisId") %>%
+      dplyr::pull(.data$covariateId)
+
+    # get the min and max of each feature
+    outData$covariateData$minMaxs <- outData$covariateData$covariates %>%
+      dplyr::filter(.data$covariateId %in% continuousFeatures) %>%
+      dplyr::group_by(.data$covariateId) %>%
+      dplyr::summarise(
+        max = max(.data$covariateValue, na.rm = TRUE),
+        min = min(.data$covariateValue, na.rm = TRUE)
+      ) %>%
+      dplyr::collect()
+    on.exit(outData$covariateData$minMaxs <- NULL, add = TRUE)
+
+    # save the normalization
+    attr(featureEngineeringSettings, "minMaxs") <-
+      outData$covariateData$minMaxs %>% dplyr::collect()
+
+    # apply the normalization to trainData
+    outData$covariateData$covariates <- outData$covariateData$covariates %>%
+      dplyr::left_join(outData$covariateData$minMaxs, by = "covariateId") %>%
+      # use ifelse to only normalize if min and max are not NA as is the case
+      # for continuous features, else return original value
+      dplyr::mutate(covariateValue = ifelse(!is.na(min) & !is.na(max),
+        (.data$covariateValue - min) / (max - min),
+        .data$covariateValue
+      )) %>%
+      dplyr::select(-c("max", "min"))
+    outData$covariateData$minMaxs <- NULL
+    done <- TRUE
+  } else {
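+    # test-time path: reuse the min/max values learned on the training data
+    # (stored on the settings object) rather than recomputing them, so test
+    # data is transformed exactly as the training data was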
+    ParallelLogger::logInfo("Applying min-max normalization of continuous features to test data")
+    outData <- list(
+      labels = trainData$labels,
+      folds = trainData$folds,
+      covariateData = Andromeda::copyAndromeda(trainData$covariateData)
+    )
+    # apply the normalization to test data by using saved normalization values
+    outData$covariateData$covariates <- outData$covariateData$covariates %>%
+      dplyr::left_join(attr(featureEngineeringSettings, "minMaxs"),
+        by = "covariateId", copy = TRUE
+      ) %>%
+      dplyr::mutate(covariateValue = ifelse(!is.na(min) & !is.na(max),
+        (.data$covariateValue - min) / (max - min),
+        .data$covariateValue
+      )) %>%
+      dplyr::select(-c("max", "min"))
+  }
+  featureEngineering <- list(
+    funct = "minMaxNormalize",
+    settings = list(
+      featureEngineeringSettings = featureEngineeringSettings,
+      done = done
+    )
+  )
+
+  attr(outData$covariateData, "metaData")$featureEngineering[["minMaxNormalize"]] <-
+    featureEngineering
+  delta <- Sys.time() - start
+  ParallelLogger::logInfo(paste0(
+    "Finished min-max normalization of continuous features in ",
+    signif(delta, 3), " ", attr(delta, "units")
+  ))
+  return(outData)
+}
+
+#' A function that normalizes continuous features by the interquartile range and
+#' forces the resulting values to be between -3 and 3 with f(x) = x / sqrt(1 + (x/3)^2)
+#' @details uses (value - median) / iqr to normalize the data and then
+#' applies the function f(x) = x / sqrt(1 + (x/3)^2) to the normalized values.
+#' This forces the values to be between -3 and 3 while preserving the relative
+#' ordering of the values.
+#' See https://arxiv.org/abs/2407.04491 for more details.
+#' @param trainData The training data to be normalized
+#' @param featureEngineeringSettings The settings for the normalization
+#' @param done Whether the data has already been normalized (bool)
+#' @return The normalized data
+#' @keywords internal
+robustNormalize <- function(trainData, featureEngineeringSettings, done = FALSE) {
+  start <- Sys.time()
+  if (!done) {
+    ParallelLogger::logInfo("Starting robust normalization of continuous features")
+    outData <- list(
+      labels = trainData$labels,
+      folds = trainData$folds,
+      covariateData = Andromeda::copyAndromeda(trainData$covariateData)
+    )
+    # find continuous features from trainData$covariateData$analysisRef
+    continuousFeatures <- outData$covariateData$analysisRef %>%
+      dplyr::filter(.data$isBinary == "N") %>%
+      dplyr::select("analysisId") %>%
+      dplyr::inner_join(outData$covariateData$covariateRef, by = "analysisId") %>%
+      dplyr::pull(.data$covariateId)
+
+    # get the (25, 75)% quantiles of each feature
+    # sqlite (used by Andromeda) doesn't have a quantile function, so we load
+    # the math extension to get the lower_quartile and upper_quartile functions
+    RSQLite::initExtension(outData$covariateData, "math")
+
+    outData$covariateData$quantiles <- outData$covariateData$covariates %>%
+      dplyr::filter(.data$covariateId %in% continuousFeatures) %>%
+      dplyr::group_by(.data$covariateId) %>%
+      dplyr::summarise(
+        q25 = dplyr::sql("lower_quartile(covariateValue)"),
+        q75 = dplyr::sql("upper_quartile(covariateValue)"),
+        median = stats::median(.data$covariateValue, na.rm = TRUE)
+      ) %>%
+      dplyr::mutate(iqr = .data$q75 - .data$q25) %>%
+      dplyr::select(-c("q75", "q25")) %>%
+      dplyr::collect()
+    on.exit(outData$covariateData$quantiles <- NULL, add = TRUE)
+
+    # save the normalization
+    attr(featureEngineeringSettings, "quantiles") <-
+      outData$covariateData$quantiles %>% dplyr::collect()
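+    # worked example of the transform: a value 10 IQRs above the median is
+    # first scaled to 10 and then smoothly clipped to
+    # 10 / sqrt(1 + (10 / 3)^2) ~= 2.87, while a value at the median stays 0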
+    # apply the normalization to trainData
+    outData$covariateData$covariates <- outData$covariateData$covariates %>%
+      dplyr::left_join(outData$covariateData$quantiles, by = "covariateId") %>%
+      # use ifelse to only normalize continuous features
+      dplyr::mutate(covariateValue = ifelse(!is.na(.data$iqr) & !is.na(.data$median),
+        (.data$covariateValue - .data$median) / .data$iqr,
+        .data$covariateValue
+      )) %>%
+      # smoothly clip the range to [-3, 3] with x / sqrt(1 + (x/3)^2)
+      # ref: https://arxiv.org/abs/2407.04491
+      dplyr::mutate(covariateValue = ifelse(!is.na(.data$iqr) & !is.na(.data$median),
+        .data$covariateValue / sqrt(1 + (.data$covariateValue / 3)^2),
+        .data$covariateValue
+      )) %>%
+      dplyr::select(-c("median", "iqr"))
+    done <- TRUE
+  } else {
+    ParallelLogger::logInfo("Applying robust normalization of continuous features to test data")
+    outData <- list(
+      labels = trainData$labels,
+      folds = trainData$folds,
+      covariateData = Andromeda::copyAndromeda(trainData$covariateData)
+    )
+    # apply the normalization to test data by using saved normalization values
+    outData$covariateData$covariates <- outData$covariateData$covariates %>%
+      dplyr::left_join(attr(featureEngineeringSettings, "quantiles"),
+        by = "covariateId", copy = TRUE
+      ) %>%
+      dplyr::mutate(covariateValue = ifelse(!is.na(.data$iqr) & !is.na(.data$median),
+        (.data$covariateValue - .data$median) / .data$iqr,
+        .data$covariateValue
+      )) %>%
+      dplyr::mutate(covariateValue = ifelse(!is.na(.data$iqr) & !is.na(.data$median),
+        .data$covariateValue / sqrt(1 + (.data$covariateValue / 3)^2),
+        .data$covariateValue
+      )) %>%
+      dplyr::select(-c("median", "iqr"))
+  }
+  featureEngineering <- list(
+    funct = "robustNormalize",
+    settings = list(
+      featureEngineeringSettings = featureEngineeringSettings,
+      done = done
+    )
+  )
+
+  attr(outData$covariateData, "metaData")$featureEngineering[["robustNormalize"]] <-
+    featureEngineering
+  delta <- Sys.time() - start
+  ParallelLogger::logInfo(paste0(
+    "Finished robust normalization in ",
+    signif(delta, 3), " ", attr(delta, "units")
+  ))
+  return(outData)
+}
+
+#' Create the settings for removing rare features
+#' @param threshold The minimum fraction of the training data that must have a
+#' feature for it to be included
+#' @return An object of class \code{featureEngineeringSettings}
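+#' @examples
+#' # a settings sketch: drop features observed in less than 0.1% of subjects
+#' remover <- createRareFeatureRemover(threshold = 0.001)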
+#' @export
+createRareFeatureRemover <- function(threshold = 0.001) {
+  checkIsClass(threshold, c("numeric"))
+  checkHigherEqual(threshold, 0)
+  checkLower(threshold, 1)
+  featureEngineeringSettings <- list(
+    threshold = threshold
+  )
+  attr(featureEngineeringSettings, "fun") <- "removeRareFeatures"
+
+  class(featureEngineeringSettings) <- "featureEngineeringSettings"
+  return(featureEngineeringSettings)
+}
+
+#' A function that removes rare features from the data
+#' @details removes features that are present in less than a certain fraction of the population
+#' @param trainData The data to remove rare features from
+#' @param featureEngineeringSettings The settings for the rare feature removal
+#' @param done Whether to identify and remove rare features (done = FALSE) or
+#' only remove previously identified features (done = TRUE)
+#' @return The data with rare features removed
+#' @keywords internal
+removeRareFeatures <- function(trainData, featureEngineeringSettings, done = FALSE) {
+  start <- Sys.time()
+  if (!done) {
+    ParallelLogger::logInfo(
+      "Removing features rarer than threshold: ", featureEngineeringSettings$threshold,
+      " from the data"
+    )
+    outData <- list(
+      labels = trainData$labels,
+      folds = trainData$folds,
+      covariateData = Andromeda::copyAndromeda(trainData$covariateData)
+    )
+    rareFeatures <- outData$covariateData$covariates %>%
+      dplyr::group_by(.data$covariateId) %>%
+      dplyr::summarise(count = dplyr::n()) %>%
+      dplyr::collect()
+    rareFeatures <- rareFeatures %>%
+      dplyr::mutate(ratio = .data$count / (
+        outData$covariateData$covariates %>%
+          dplyr::summarise(popSize = dplyr::n_distinct(.data$rowId)) %>%
+          dplyr::pull()
+      )) %>%
+      dplyr::filter(.data$ratio <= featureEngineeringSettings$threshold) %>%
+      dplyr::pull(c("covariateId"))
+
+    outData$covariateData$covariates <- outData$covariateData$covariates %>%
+      dplyr::filter(!.data$covariateId %in% rareFeatures)
+    outData$covariateData$covariateRef <- outData$covariateData$covariateRef %>%
+      dplyr::filter(!.data$covariateId %in% rareFeatures)
+
+    attr(featureEngineeringSettings, "rareFeatures") <- rareFeatures
+
+    done <- TRUE
+  } else {
+    ParallelLogger::logInfo(
+      "Applying rare feature removal with frequency below: ",
+      featureEngineeringSettings$threshold, " to test data"
+    )
+    outData <- list(
+      labels = trainData$labels,
+      folds = trainData$folds,
+      covariateData = Andromeda::copyAndromeda(trainData$covariateData)
+    )
+    outData$covariateData$covariates <- outData$covariateData$covariates %>%
+      dplyr::filter(
+        !.data$covariateId %in% !!attr(featureEngineeringSettings, "rareFeatures")
+      )
+    outData$covariateData$covariateRef <- outData$covariateData$covariateRef %>%
+      dplyr::filter(
+        !.data$covariateId %in% !!attr(featureEngineeringSettings, "rareFeatures")
+      )
+  }
+  featureEngineering <- list(
+    funct = "removeRareFeatures",
+    settings = list(
+      featureEngineeringSettings = featureEngineeringSettings,
+      done = done
+    )
+  )
+  attr(outData$covariateData, "metaData")$featureEngineering[["removeRare"]] <-
+    featureEngineering
+  delta <- Sys.time() - start
+  ParallelLogger::logInfo(paste0(
+    "Finished rare feature removal in ",
+    signif(delta, 3), " ", attr(delta, "units")
+  ))
+  return(outData)
+}
diff --git a/R/Imputation.R b/R/Imputation.R
new file mode 100644
index 00000000..319cd87c
--- /dev/null
+++ b/R/Imputation.R
@@ -0,0 +1,865 @@
+# @file Imputation.R
+# Copyright 2025 Observational Health Data Sciences and Informatics
+#
+# This file is part of PatientLevelPrediction
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#' @title Create Iterative Imputer settings
+#' @description This function creates the settings for an iterative imputer
+#' which first removes features with more than `missingThreshold` missing values
+#' and then imputes the missing values iteratively using chained equations
+#' @param missingThreshold The threshold for missing values to remove a feature
+#' @param method The method to use for imputation, currently only "pmm" is supported
+#' @param methodSettings A list of settings for the imputation method to use.
+#' Currently only "pmm" is supported with the following settings:
+#' - k: The number of donors to use for matching
+#' - iterations: The number of iterations to use for imputation
+#' @return The settings for the single imputer of class `featureEngineeringSettings`
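+#' @examples
+#' \dontrun{
+#' # a settings sketch (requires glmnet to be installed; no fitting happens here)
+#' imputer <- createIterativeImputer(
+#'   missingThreshold = 0.3,
+#'   method = "pmm",
+#'   methodSettings = list(pmm = list(k = 5, iterations = 5))
+#' )
+#' }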
+#' @export
+createIterativeImputer <- function(missingThreshold = 0.3,
+                                   method = "pmm",
+                                   methodSettings = list(
+                                     pmm = list(
+                                       k = 5,
+                                       iterations = 5
+                                     )
+                                   )) {
+  ParallelLogger::logWarn(
+    "Imputation is experimental and may not work as expected. ",
+    "Please report any issues on the GitHub repository."
+  )
+  checkIsClass(missingThreshold, "numeric")
+  checkInStringVector(method, c("pmm"))
+  checkIsClass(methodSettings, "list")
+  if (method == "pmm") {
+    checkIsClass(methodSettings$pmm$k, "numeric")
+    checkHigher(methodSettings$pmm$k, 0)
+    checkIsClass(methodSettings$pmm$iterations, "numeric")
+    checkHigher(methodSettings$pmm$iterations, 0)
+  }
+  checkHigher(missingThreshold, 0)
+  checkLower(missingThreshold, 1)
+  featureEngineeringSettings <- list(
+    missingThreshold = missingThreshold,
+    method = method,
+    methodSettings = methodSettings[[method]]
+  )
+  if (method == "pmm") {
+    # at the moment this requires glmnet
+    rlang::check_installed("glmnet")
+  }
+  attr(featureEngineeringSettings, "fun") <- "iterativeImpute"
+
+  class(featureEngineeringSettings) <- "featureEngineeringSettings"
+  return(featureEngineeringSettings)
+}
+
+#' @title Create Simple Imputer settings
+#' @description This function creates the settings for a simple imputer
+#' which imputes missing values with the mean or median
+#' @param method The method to use for imputation, either "mean" or "median"
+#' @param missingThreshold The threshold for missing values to be imputed vs removed
+#' @return The settings for the single imputer of class `featureEngineeringSettings`
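+#' @examples
+#' # a settings sketch: median-impute features missing in < 30% of subjects,
+#' # remove features missing in more
+#' imputer <- createSimpleImputer(method = "median", missingThreshold = 0.3)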
+#' @export
+createSimpleImputer <- function(method = "mean",
+                                missingThreshold = 0.3) {
+  ParallelLogger::logWarn(
+    "Imputation is experimental and may not work as expected, ",
+    "please report any issues on the GitHub repository."
+  )
+  checkIsClass(method, "character")
+  checkInStringVector(method, c("mean", "median"))
+  checkIsClass(missingThreshold, "numeric")
+  checkHigher(missingThreshold, 0)
+  checkLower(missingThreshold, 1)
+  featureEngineeringSettings <- list(
+    method = method,
+    missingThreshold = missingThreshold
+  )
+  attr(featureEngineeringSettings, "fun") <- "simpleImpute"
+
+  class(featureEngineeringSettings) <- "featureEngineeringSettings"
+  return(featureEngineeringSettings)
+}
+
+#' @title Simple Imputation
+#' @description This function does single imputation with the mean or median
+#' @param trainData The data to be imputed
+#' @param featureEngineeringSettings The settings for the imputation
+#' @param done Whether the imputation has already been done (bool)
+#' @return The imputed data
+#' @keywords internal
+simpleImpute <- function(trainData, featureEngineeringSettings, done = FALSE) {
+  start <- Sys.time()
+  if (!done) {
+    ParallelLogger::logInfo(
+      "Imputing missing values with simpleImputer using: ",
+      featureEngineeringSettings$method, " and missing threshold: ",
+      featureEngineeringSettings$missingThreshold
+    )
+    outputData <- list(
+      labels = trainData$labels,
+      folds = trainData$folds,
+      covariateData = Andromeda::copyAndromeda(trainData$covariateData)
+    )
+    missingInfo <- extractMissingInfo(outputData)
+    outputData$covariateData$missingInfo <- missingInfo$missingInfo
+    continuousFeatures <- missingInfo$continuousFeatures
+    on.exit(outputData$covariateData$missingInfo <- NULL, add = TRUE)
+
+    outputData$covariateData$covariates <- outputData$covariateData$covariates %>%
+      dplyr::left_join(outputData$covariateData$missingInfo, by = "covariateId") %>%
+      dplyr::filter(is.na(.data$missing) ||
+        .data$missing <= featureEngineeringSettings$missingThreshold) %>%
+      dplyr::select(-"missing")
+
+    # separate the continuous and binary features
+    featureData <- separateFeatures(outputData, continuousFeatures)
+    numericData <- featureData[[1]]
+    on.exit(numericData <- NULL, add = TRUE)
+
+    allRowIds <- trainData$labels$rowId
+    allColumnIds <- numericData$covariates %>%
+      dplyr::pull(.data$covariateId) %>%
+      unique() %>%
+      sort()
+    completeIds <- expand.grid(rowId = allRowIds, covariateId = allColumnIds)
+    numericData$covariates <- merge(completeIds, numericData$covariates,
+      all.x = TRUE
+    )
+
+    if (featureEngineeringSettings$method == "mean") {
+      numericData$imputedValues <- numericData$covariates %>%
+        dplyr::group_by(.data$covariateId) %>%
+        dplyr::summarise(imputedValues = mean(.data$covariateValue, na.rm = TRUE))
+    } else if (featureEngineeringSettings$method == "median") {
+      numericData$imputedValues <- numericData$covariates %>%
+        dplyr::group_by(.data$covariateId) %>%
+        dplyr::collect() %>% # median not possible in sql
+        dplyr::summarise(imputedValues = stats::median(.data$covariateValue, na.rm = TRUE))
+    }
+
+    allRowIds <- outputData$labels$rowId
+    allColumnIds <- numericData$covariates %>%
+      dplyr::pull(.data$covariateId) %>%
+      unique() %>%
+      sort()
+    completeIds <- expand.grid(rowId = allRowIds, covariateId = allColumnIds)
+    numericData$covariates <- merge(completeIds, numericData$covariates,
+      all.x = TRUE
+    )
+
+    numericData$imputedCovariates <- numericData$covariates %>%
+      dplyr::left_join(numericData$imputedValues, by = "covariateId") %>%
+      dplyr::group_by(.data$covariateId) %>%
+      dplyr::mutate(imputedValue = ifelse(is.na(.data$covariateValue),
+        .data$imputedValues,
+        .data$covariateValue
+      )) %>%
+      dplyr::select(-c("imputedValues"))
+    Andromeda::appendToTable(
+      outputData$covariateData$covariates,
+      numericData$imputedCovariates %>%
+        dplyr::filter(is.na(.data$covariateValue)) %>%
+        dplyr::mutate(covariateValue = .data$imputedValue) %>%
+        dplyr::select(-c("imputedValue"))
+    )
+    attr(featureEngineeringSettings, "missingInfo") <-
+      outputData$covariateData$missingInfo %>%
+      dplyr::collect()
+    attr(featureEngineeringSettings, "imputer") <-
+      numericData$imputedValues %>% dplyr::collect()
+    done <- TRUE
+  } else {
+    ParallelLogger::logInfo(
+      "Applying imputation to test data with simpleImputer using method: ",
+      featureEngineeringSettings$method, " and missing threshold: ",
+      featureEngineeringSettings$missingThreshold
+    )
+    outputData <- list(
+      labels = trainData$labels,
+      folds = trainData$folds,
+      covariateData = Andromeda::copyAndromeda(trainData$covariateData)
+    )
+    outputData$covariateData$missingInfo <- attr(
+      featureEngineeringSettings,
+      "missingInfo"
+    )
+    on.exit(outputData$covariateData$missingInfo <- NULL, add = TRUE)
+    outputData$covariateData$covariates <- outputData$covariateData$covariates %>%
+      dplyr::left_join(outputData$covariateData$missingInfo, by = "covariateId") %>%
+      dplyr::filter(is.na(.data$missing) ||
+        .data$missing <= featureEngineeringSettings$missingThreshold) %>%
+      dplyr::select(-"missing")
+
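+    # recompute which covariates are continuous from analysisRef, mirroring
+    # the training-time logic in extractMissingInfo()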
+    continuousFeatures <- outputData$covariateData$analysisRef %>%
+      dplyr::filter(.data$isBinary == "N") %>%
+      dplyr::select("analysisId") %>%
+      dplyr::inner_join(outputData$covariateData$covariateRef, by = "analysisId") %>%
+      dplyr::pull(.data$covariateId)
+    featureData <- separateFeatures(outputData, continuousFeatures)
+    numericData <- featureData[[1]]
+    on.exit(numericData <- NULL, add = TRUE)
+    # impute missing values
+    allRowIds <- numericData$covariates %>%
+      dplyr::pull(.data$rowId) %>%
+      unique() %>%
+      sort()
+    allColumnIds <- numericData$covariates %>%
+      dplyr::pull(.data$covariateId) %>%
+      unique() %>%
+      sort()
+    completeIds <- expand.grid(rowId = allRowIds, covariateId = allColumnIds)
+    numericData$covariates <- merge(completeIds, numericData$covariates,
+      all.x = TRUE
+    )
+    numericData$imputedValues <- attr(featureEngineeringSettings, "imputer")
+    numericData$imputedCovariates <- numericData$covariates %>%
+      dplyr::left_join(numericData$imputedValues, by = "covariateId") %>%
+      dplyr::group_by(.data$covariateId) %>%
+      dplyr::mutate(imputedValue = ifelse(is.na(.data$covariateValue),
+        .data$imputedValues,
+        .data$covariateValue
+      )) %>%
+      dplyr::select(-c("imputedValues"))
+    Andromeda::appendToTable(
+      outputData$covariateData$covariates,
+      numericData$imputedCovariates %>%
+        dplyr::filter(is.na(.data$covariateValue)) %>%
+        dplyr::mutate(covariateValue = .data$imputedValue) %>%
+        dplyr::select(-c("imputedValue"))
+    )
+  }
+  featureEngineering <- list(
+    funct = "simpleImpute",
+    settings = list(
+      featureEngineeringSettings = featureEngineeringSettings,
+      done = done
+    )
+  )
+  attr(outputData$covariateData, "metaData")$featureEngineering[["simpleImputer"]] <-
+    featureEngineering
+  delta <- Sys.time() - start
+  ParallelLogger::logInfo(
+    "Imputation done in time: ", signif(delta, 3), " ",
+    attr(delta, "units")
+  )
+  return(outputData)
+}
+
+
+#' @title Imputation
+#' @description This function does single imputation with predictive mean matching
+#' @param trainData The data to be imputed
+#' @param featureEngineeringSettings The settings for the imputation
+#' @param done Whether the imputation has already been done (bool)
+#' @return The imputed data
+#' @keywords internal
+iterativeImpute <- function(trainData, featureEngineeringSettings, done = FALSE) {
+  start <- Sys.time()
+  if (!done) {
+    ParallelLogger::logInfo(
+      "Imputing missing values with iterativeImputer using: ",
+      featureEngineeringSettings$method, ", missing threshold: ",
+      featureEngineeringSettings$missingThreshold, " and method settings: ",
+      featureEngineeringSettings$methodSettings
+    )
+    outputData <- list(
+      labels = trainData$labels,
+      folds = trainData$folds,
+      covariateData = Andromeda::copyAndromeda(trainData$covariateData)
+    )
+    missingInfo <- extractMissingInfo(outputData)
+    outputData$covariateData$missingInfo <- missingInfo$missingInfo
+    continuousFeatures <- missingInfo$continuousFeatures
+    on.exit(outputData$covariateData$missingInfo <- NULL, add = TRUE)
+
+    outputData$covariateData$covariates <- outputData$covariateData$covariates %>%
+      dplyr::left_join(outputData$covariateData$missingInfo, by = "covariateId") %>%
+      dplyr::filter(is.na(.data$missing) ||
+        .data$missing <= featureEngineeringSettings$missingThreshold) %>%
+      dplyr::select(-"missing")
+
+    # separate the continuous and binary features
+    featureData <- separateFeatures(outputData, continuousFeatures)
+    numericData <- featureData[[1]]
+    binary <- featureData[[2]]
+    on.exit(numericData <- NULL, add = TRUE)
+    on.exit(binary <- NULL, add = TRUE)
+
+    numericData <- initializeImputation(numericData, "mean",
+      labels = outputData$labels
+    )
+    # add imputed values in data
+    iterativeImputeResults <- iterativeChainedImpute(numericData,
+      binary,
+      outputData,
+      featureEngineeringSettings,
+      direction = "ascending",
+      iterations = featureEngineeringSettings$methodSettings$iterations
+    )
+
+    Andromeda::appendToTable(
+      outputData$covariateData$covariates,
+      iterativeImputeResults$numericData$imputedCovariates %>%
+        dplyr::filter(is.na(.data$covariateValue)) %>%
+        dplyr::mutate(covariateValue = .data$imputedValue) %>%
+        dplyr::select(-c("imputedValue"))
+    )
+
+    attr(featureEngineeringSettings, "missingInfo") <-
+      outputData$covariateData$missingInfo %>%
+      dplyr::collect()
+    attr(featureEngineeringSettings, "imputer") <- iterativeImputeResults$modelInfo
+    attr(featureEngineeringSettings, "kdeEstimates") <- iterativeImputeResults$kdeEstimates
+    done <- TRUE
+  } else {
+    ParallelLogger::logInfo(
+      "Applying imputation to test data with iterativeImputer using method: ",
+      featureEngineeringSettings$method, " and missing threshold: ",
+      featureEngineeringSettings$missingThreshold
+    )
+    outputData <- list(
+      labels = trainData$labels,
+      folds = trainData$folds,
+      covariateData = Andromeda::copyAndromeda(trainData$covariateData)
+    )
+    # remove data with more than missingThreshold
+    outputData$covariateData$missingInfo <- attr(
+      featureEngineeringSettings,
+      "missingInfo"
+    )
+    on.exit(outputData$covariateData$missingInfo <- NULL, add = TRUE)
+    outputData$covariateData$covariateIsBinary <- outputData$covariateData$covariateRef %>%
+      dplyr::select("covariateId", "analysisId") %>%
+      dplyr::inner_join(
+        outputData$covariateData$analysisRef %>%
+          dplyr::select("analysisId", "isBinary"),
+        by = "analysisId"
+      ) %>%
+      dplyr::mutate(isBinary = .data$isBinary == "Y") %>%
+      dplyr::select("covariateId", "isBinary") %>%
+      dplyr::compute()
+    on.exit(outputData$covariateData$covariateIsBinary <- NULL, add = TRUE)
+    outputData$covariateData$covariates <- outputData$covariateData$covariates %>%
+      dplyr::left_join(outputData$covariateData$missingInfo, by = "covariateId") %>%
+      dplyr::left_join(outputData$covariateData$covariateIsBinary, by = "covariateId") %>%
+      dplyr::filter(
+        (!is.na(.data$missing) && .data$missing <= featureEngineeringSettings$missingThreshold) ||
+          (is.na(.data$missing) && .data$isBinary)
+      ) %>%
+      dplyr::select(-"missing", -"isBinary")
+
+    continuousFeatures <- outputData$covariateData$analysisRef %>%
+      dplyr::filter(.data$isBinary == "N") %>%
+      dplyr::select("analysisId") %>%
+      dplyr::inner_join(outputData$covariateData$covariateRef, by = "analysisId") %>%
+      dplyr::pull(.data$covariateId)
+    featureData <- separateFeatures(outputData, continuousFeatures)
+    numericData <- featureData[[1]]
+    binary <- featureData[[2]]
+    on.exit(numericData <- NULL, add = TRUE)
+    on.exit(binary <- NULL, add = TRUE)
+    # impute missing values
+    allRowIds <- outputData$labels$rowId
+    allColumnIds <- numericData$covariates %>%
+      dplyr::pull(.data$covariateId) %>%
+      unique() %>%
+      sort()
+    completeIds <- expand.grid(rowId = allRowIds, covariateId = allColumnIds)
+    # now we have NAs for missing combinations
+    numericData$covariates <- merge(completeIds, numericData$covariates,
+      all.x = TRUE
+    )
+
+    # get index of NAs for every feature to be imputed
+    numericData$missingIndex <- numericData$covariates %>%
+      dplyr::filter(is.na(.data$covariateValue)) %>%
+      dplyr::select(-c("covariateValue"))
+    on.exit(numericData$missingIndex <- NULL, add = TRUE)
+
+    numericData$imputedCovariates <- numericData$covariates %>%
+      dplyr::group_by(.data$covariateId) %>%
+      dplyr::mutate(imputedValue = .data$covariateValue)
+    on.exit(numericData$imputedCovariates <- NULL, add = TRUE)
+
+    varsToImpute <- numericData$missingIndex %>%
+      dplyr::pull(.data$covariateId) %>%
+      unique()
+    for (varId in varsToImpute) {
+      varName <- outputData$covariateData$covariateRef %>%
+        dplyr::filter(.data$covariateId == varId) %>%
+        dplyr::pull(.data$covariateName)
+      ParallelLogger::logInfo("Imputing variable: ", varName)
+      numericData$y <- numericData$covariates %>%
+        dplyr::filter(.data$covariateId == varId) %>%
+        dplyr::mutate(y = .data$covariateValue) %>%
+        dplyr::select("y", "rowId")
+      on.exit(numericData$y <- NULL, add = TRUE)
+      missIdx <- which(is.na(numericData$y %>% dplyr::pull(.data$y)))
+      numericData$X <- numericData$covariates %>%
+        dplyr::filter(.data$covariateId != varId)
+      on.exit(numericData$X <- NULL, add = TRUE)
+      Andromeda::appendToTable(numericData$X, binary$covariates)
+      numericData$xMiss <- numericData$X %>%
+        dplyr::filter(.data$rowId %in% !!allRowIds[missIdx])
+      on.exit(numericData$xMiss <- NULL, add = TRUE)
+
+      imputer <-
+        attr(featureEngineeringSettings, "imputer")[[as.character(varId)]]
+      pmmResults <- pmmPredict(numericData,
+        k = featureEngineeringSettings$methodSettings$k, imputer
+      )
+
+      # update imputations in data
+      numericData$imputedValues <- pmmResults$imputedValues
+      on.exit(numericData$imputedValues <- NULL, add = TRUE)
+      numericData$imputedCovariates <- numericData$imputedCovariates %>%
+        dplyr::left_join(numericData$imputedValues,
+          by = "rowId",
+          suffix = c("", ".new")
+        ) %>%
+        dplyr::mutate(
+          imputedValue =
+            dplyr::if_else(.data$covariateId == varId &&
+              !is.na(.data$imputedValue.new),
+            .data$imputedValue.new,
+            .data$imputedValue
+            )
+        ) %>%
+        dplyr::select(-"imputedValue.new")
+    }
+    # add imputed values in data
+    Andromeda::appendToTable(
+      outputData$covariateData$covariates,
+      numericData$imputedCovariates %>%
+        dplyr::filter(is.na(.data$covariateValue)) %>%
+        dplyr::mutate(covariateValue = .data$imputedValue) %>%
+        dplyr::select(-c("imputedValue"))
+    )
+  }
+  featureEngineering <- list(
+    funct = "iterativeImpute",
+    settings = list(
+      featureEngineeringSettings = featureEngineeringSettings,
+      done = done
+    )
+  )
+  attr(outputData$covariateData, "metaData")$featureEngineering[["iterativeImputer"]] <-
+    featureEngineering
+  delta <- Sys.time() - start
+  ParallelLogger::logInfo(
+    "Imputation done in time: ", signif(delta, 3), " ",
+    attr(delta, "units")
+  )
+  return(outputData)
+}
+
+#' @title Predictive mean matching using lasso
+#' @param data An andromeda object with the following fields:
+#' xObs: covariates table for observed data
+#' xMiss: covariates table for missing data
+#' yObs: outcome variable that we want to impute
+#' @param k The number of donors to use for matching (default 5)
+#' @keywords internal
+pmmFit <- function(data, k = 5) {
+  rlang::check_installed("glmnet")
+  data$rowMap <- data$xObs %>%
+    dplyr::group_by(.data$rowId) %>%
+    dplyr::summarise() %>%
+    dplyr::mutate(
+      oldRowId = .data$rowId,
+      newRowId = dplyr::row_number()
+    ) %>%
+    dplyr::select(c("newRowId", "oldRowId")) %>%
+    dplyr::compute()
+  on.exit(data$rowMap <- NULL, add = TRUE)
+  data$colMap <- data$xObs %>%
+    dplyr::group_by(.data$covariateId) %>%
+    dplyr::summarise() %>%
+    dplyr::mutate(
+      oldCovariateId = .data$covariateId,
+      newCovariateId = dplyr::row_number()
+    ) %>%
+    dplyr::select(c("newCovariateId", "oldCovariateId"))
+  on.exit(data$colMap <- NULL, add = TRUE)
+
+  data$xObs <- data$xObs %>%
+    dplyr::left_join(data$rowMap, by = c("rowId" = "oldRowId")) %>%
+    dplyr::left_join(data$colMap, by = c("covariateId" = "oldCovariateId")) %>%
+    dplyr::select(
+      rowId = "newRowId",
+      covariateId = "newCovariateId",
+      covariateValue = "covariateValue"
+    )
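+  # build a sparse design matrix from the remapped (row, covariate) pairs;
+  # glmnet needs contiguous 1-based indices, hence the row and column maps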
"newCovariateId", + covariateValue = "covariateValue" + ) + + xObs <- Matrix::sparseMatrix( + i = data$xObs %>% dplyr::pull(.data$rowId), + j = data$xObs %>% dplyr::pull(.data$covariateId), + x = data$xObs %>% dplyr::pull(.data$covariateValue), + dims = c( + data$rowMap %>% dplyr::pull(.data$newRowId) %>% max(), + data$colMap %>% dplyr::pull(.data$newCovariateId) %>% max() + ) + ) + + fit <- glmnet::cv.glmnet(xObs, data$yObs %>% + dplyr::pull(.data$y), alpha = 1, nfolds = 3) + + # predict on both XObs and XMiss + predsObs <- stats::predict(fit, xObs, fit$lambda.min) + data$xMiss <- data$xMiss %>% + dplyr::left_join( + data$xMiss %>% + dplyr::group_by(.data$covariateId) %>% + dplyr::summarise( + n_unique = dplyr::n_distinct(.data$covariateValue), + max = max(.data$covariateValue, na.rm = TRUE), + min = min(.data$covariateValue, na.rm = TRUE), + ), + by = "covariateId" + ) %>% + dplyr::group_by(.data$covariateId) %>% + dplyr::mutate( + covariateValue = ifelse(.data$n_unique > 2 & (.data$max - .data$max) > 0, + (.data$covariateValue - .data$min) / (.data$max - .data$min), + .data$covariateValue + ) + ) %>% + dplyr::ungroup() %>% + dplyr::select(-c("n_unique", "min", "max")) + data$rowMapMiss <- data$xMiss %>% + dplyr::group_by(.data$rowId) %>% + dplyr::summarise() %>% + dplyr::mutate( + oldRowId = .data$rowId, + newRowId = dplyr::row_number() + ) %>% + dplyr::select(c("newRowId", "oldRowId")) %>% + dplyr::compute() + on.exit(data$rowMapMiss <- NULL, add = TRUE) + data$xMiss <- data$xMiss %>% + dplyr::left_join(data$rowMapMiss, by = c("rowId" = "oldRowId")) %>% + dplyr::left_join(data$colMap, by = c("covariateId" = "oldCovariateId")) %>% + dplyr::select( + rowId = "newRowId", + covariateId = "newCovariateId", + covariateValue = "covariateValue" + ) + + xMiss <- Matrix::sparseMatrix( + i = data$xMiss %>% dplyr::pull(.data$rowId), + j = data$xMiss %>% dplyr::pull(.data$covariateId), + x = data$xMiss %>% dplyr::pull(.data$covariateValue), + dims = c( + data$xMiss %>% dplyr::pull(.data$rowId) %>% max(), + data$xMiss %>% dplyr::pull(.data$covariateId) %>% max() + ) + ) + + predsMiss <- stats::predict(fit, xMiss, fit$lambda.min) + + # precompute mapping to use - straight from xId (row index) to + # covariateValue of donor + donorMapping <- data$rowMap %>% + dplyr::inner_join(data$yObs, by = c("oldRowId" = "rowId"), copy = TRUE) %>% + dplyr::pull(.data$y) + # for each missing value, find the k closest observed values + imputedValues <- numeric(nrow(xMiss)) + for (j in 1:nrow(xMiss)) { + distances <- abs(predsObs - predsMiss[j]) + donorIndices <- order(distances)[1:k] + donorValues <- donorMapping[donorIndices] + imputedValues[j] <- sample(donorValues, 1) + } + + results <- list() + results$imputedValues <- data.frame( + rowId = data$rowMapMiss %>% + dplyr::pull(.data$oldRowId), + imputedValue = imputedValues + ) + bestIndex <- which(fit$lambda == fit$lambda.min) + nonZero <- which(fit$glmnet.fit$beta[, bestIndex] != 0) + nonZeroCovariateIds <- data$colMap %>% + dplyr::filter(.data$newCovariateId %in% nonZero) %>% + dplyr::pull(.data$oldCovariateId) + if (length(nonZero) == 0) { + ParallelLogger::logWarn("Imputation model only has intercept. 
+  if (length(nonZero) == 0) {
+    ParallelLogger::logWarn(
+      "Imputation model only has intercept. It does not fit the data well"
+    )
+  }
+  results$model <- list(
+    intercept = as.numeric(fit$glmnet.fit$a0[bestIndex]),
+    coefficients = data.frame(
+      covariateId = nonZeroCovariateIds,
+      values = as.numeric(fit$glmnet.fit$beta[nonZero, bestIndex])
+    ),
+    predictions = data.frame(
+      rowId = data$rowMap %>%
+        dplyr::pull(.data$oldRowId),
+      prediction = as.numeric(predsObs)
+    )
+  )
+  return(results)
+}
+
+pmmPredict <- function(data, k = 5, imputer) {
+  data$coefficients <- imputer$coefficients
+  predictionMissing <- data$xMiss %>%
+    dplyr::inner_join(data$coefficients, by = "covariateId") %>%
+    dplyr::mutate(values = .data$covariateValue * .data$values) %>%
+    dplyr::group_by(.data$rowId) %>%
+    dplyr::summarise(value = sum(.data$values, na.rm = TRUE)) %>%
+    dplyr::select("rowId", "value")
+  predictionMissing <- as.data.frame(predictionMissing)
+  if (length(predictionMissing$value) == 0) {
+    # prediction model for imputing only has intercept
+    ParallelLogger::logWarn(
+      "Imputation model only has an intercept, imputing with the intercept. ",
+      "Something probably went wrong during fitting."
+    )
+    predictionMissing <- data.frame(
+      rowId = data$xMiss %>%
+        dplyr::pull(.data$rowId) %>%
+        unique(),
+      value = imputer$intercept
+    )
+  } else {
+    predictionMissing$value <- predictionMissing$value + imputer$intercept
+  }
+
+  # precompute mapping to use - straight from xId (row index) to
+  # covariateValue of donor
+  donorMapping <- imputer$predictions %>% dplyr::pull(.data$prediction)
+
+  # for each missing value, find the k closest observed values
+  nRows <- data$xMiss %>%
+    dplyr::pull(.data$rowId) %>%
+    dplyr::n_distinct()
+  imputedValues <- numeric(nRows)
+  predsObs <- imputer$predictions$prediction
+  for (j in 1:nRows) {
+    distances <- abs(predsObs - predictionMissing$value[j])
+    donorIndices <- order(distances)[1:k]
+    donorValues <- donorMapping[donorIndices]
+    imputedValues[j] <- sample(donorValues, 1)
+  }
+
+  results <- list()
+  results$imputedValues <- data.frame(
+    rowId = predictionMissing %>%
+      dplyr::pull(.data$rowId),
+    imputedValue = imputedValues
+  )
+  return(results)
+}
+
+extractMissingInfo <- function(trainData) {
+  ParallelLogger::logInfo("Calculating missingness in data")
+  total <- trainData$covariateData$covariates %>%
+    dplyr::summarise(total = dplyr::n_distinct(.data$rowId)) %>%
+    dplyr::pull()
+  continuousFeatures <- trainData$covariateData$analysisRef %>%
+    dplyr::filter(.data$isBinary == "N") %>%
+    dplyr::select("analysisId") %>%
+    dplyr::inner_join(trainData$covariateData$covariateRef, by = "analysisId") %>%
+    dplyr::pull(.data$covariateId)
+
+  missingInfo <- trainData$covariateData$covariates %>%
+    dplyr::filter(.data$covariateId %in% continuousFeatures) %>%
+    dplyr::group_by(.data$covariateId) %>%
+    dplyr::summarise(counts = dplyr::n()) %>%
+    dplyr::collect() %>% # necessary because of integer division in sqlite
+    dplyr::mutate(missing = 1 - .data$counts / total) %>%
+    dplyr::select(c("covariateId", "missing"))
+  results <- list(
+    "missingInfo" = missingInfo,
+    "continuousFeatures" = continuousFeatures
+  )
+  ParallelLogger::logInfo("Found ", nrow(missingInfo), " features with missing values")
+
+  return(results)
+}
+
+separateFeatures <- function(trainData, continuousFeatures) {
+  numericData <- Andromeda::andromeda()
+  numericData$covariates <- trainData$covariateData$covariates %>%
+    dplyr::filter(.data$covariateId %in% continuousFeatures)
+  numericData$covariateRef <- trainData$covariateData$covariateRef %>%
+    dplyr::filter(.data$covariateId %in% continuousFeatures)
+
+  binaryData <- Andromeda::andromeda()
+  binaryData$covariates <- trainData$covariateData$covariates %>%
+    dplyr::filter(!.data$covariateId %in% !!continuousFeatures)
+  binaryData$covariateRef <- trainData$covariateData$covariateRef %>%
+    dplyr::filter(!.data$covariateId %in% !!continuousFeatures)
+  return(list(numericData, binaryData))
+}
+
+initializeImputation <- function(numericData, method = "mean", labels) {
+  allRowIds <- labels$rowId
+  allColumnIds <- numericData$covariates %>%
+    dplyr::pull(.data$covariateId) %>%
+    unique() %>%
+    sort()
+  completeIds <- expand.grid(rowId = allRowIds, covariateId = allColumnIds)
+  numericData$covariates <- merge(completeIds, numericData$covariates,
+    all.x = TRUE
+  )
+
+  # get index of NAs for every feature to be imputed
+  numericData$missingIndex <- numericData$covariates %>%
+    dplyr::filter(is.na(.data$covariateValue)) %>%
+    dplyr::select(-c("covariateValue"))
+
+  if (method == "mean") {
+    numericData$imputedCovariates <- numericData$covariates %>%
+      dplyr::group_by(.data$covariateId) %>%
+      dplyr::mutate(imputedValue = ifelse(is.na(.data$covariateValue),
+        mean(.data$covariateValue, na.rm = TRUE),
+        .data$covariateValue
+      ))
+  } else {
+    stop("Unknown initialization method: ", method)
+  }
+  return(numericData)
+}
+
+# Main (M)ICE algorithm - iterative imputation with chained equations
+iterativeChainedImpute <- function(numericData,
+                                   binaryData,
+                                   originalData,
+                                   featureEngineeringSettings,
+                                   direction = "ascending",
+                                   iterations = 5) {
+  prevImputations <- list()
+  allRowIds <- numericData$covariates %>%
+    dplyr::pull(.data$rowId) %>%
+    unique() %>%
+    sort()
+  maxIter <- iterations # TODO check
+  varsToImpute <- numericData$missingIndex %>%
+    dplyr::pull(.data$covariateId) %>%
+    unique()
+  convergenceParameters <- list()
+  modelInfo <- list()
+
+  for (iter in 1:maxIter) {
+    ParallelLogger::logInfo("Imputation iteration: ", iter)
+    currentImputations <- list()
+
+    # TODO do in order from least missing to most missing
+    for (varId in varsToImpute) {
+      varName <- originalData$covariateData$covariateRef %>%
+        dplyr::filter(.data$covariateId == varId) %>%
+        dplyr::pull(.data$covariateName)
+      ParallelLogger::logInfo("Imputing variable: ", varName)
+      numericData$y <- numericData$covariates %>%
+        dplyr::filter(.data$covariateId == varId) %>%
+        dplyr::mutate(y = .data$covariateValue) %>%
+        dplyr::select("y", "rowId")
+      on.exit(numericData$y <- NULL, add = TRUE)
+      obsIdx <- which(!is.na(numericData$y %>% dplyr::pull(.data$y)))
+      missIdx <- which(is.na(numericData$y %>% dplyr::pull(.data$y)))
+      numericData$yObs <- numericData$y %>%
+        dplyr::filter(.data$rowId %in% !!allRowIds[obsIdx])
+      on.exit(numericData$yObs <- NULL, add = TRUE)
+
+      numericData$X <- numericData$imputedCovariates %>%
+        dplyr::filter(.data$covariateId != varId) %>%
+        dplyr::mutate(covariateValue = .data$imputedValue) %>%
+        dplyr::select(-c("imputedValue"))
+      on.exit(numericData$X <- NULL, add = TRUE)
+      Andromeda::appendToTable(numericData$X, binaryData$covariates)
+      numericData$xObs <- numericData$X %>% dplyr::filter(.data$rowId %in% !!allRowIds[obsIdx])
+      on.exit(numericData$xObs <- NULL, add = TRUE)
+      numericData$xMiss <- numericData$X %>% dplyr::filter(.data$rowId %in% !!allRowIds[missIdx])
+      on.exit(numericData$xMiss <- NULL, add = TRUE)
+
+      pmmResults <- pmmFit(numericData, k = featureEngineeringSettings$methodSettings$k)
+
+      # update imputations in data
+      numericData$imputedValues <- pmmResults$imputedValues
+      on.exit(numericData$imputedValues <- NULL, add = TRUE)
+      numericData$imputedCovariates <- numericData$imputedCovariates %>%
+        dplyr::left_join(numericData$imputedValues,
+          by = "rowId",
+          suffix = c("", ".new")
+        ) %>%
+        dplyr::mutate(
+          imputedValue =
+            dplyr::if_else(.data$covariateId == varId &&
+              !is.na(.data$imputedValue.new),
+            .data$imputedValue.new,
+            .data$imputedValue
+            )
+        ) %>%
+        dplyr::select(-"imputedValue.new")
+
+      # store current imputations for convergence check
+      currentImputations[[as.character(varId)]] <- pmmResults$imputedValues$imputedValue
+
+      # store pmm info for each variable
+      modelInfo[[as.character(varId)]] <- pmmResults$model
+    }
+
+    # save values for convergence checking afterwards
+    # store mean and variance of imputed values for each variable
+    # as well as average change from previous iteration
+    meanVector <- numeric(length(varsToImpute))
+    varVector <- numeric(length(varsToImpute))
+    idx <- 1
+    for (varId in varsToImpute) {
+      currentImputation <- currentImputations[[as.character(varId)]]
+      meanVector[idx] <- mean(currentImputation)
+      varVector[idx] <- stats::var(currentImputation)
+      idx <- idx + 1
+    }
+    convergenceInfo <- list(
+      meanVector = meanVector,
+      varVector = varVector
+    )
+    if (iter > 1) {
+      meanVarChange <- numeric(0)
+      for (varId in varsToImpute) {
+        prevImputation <- prevImputations[[as.character(varId)]]
+        currentImputation <- currentImputations[[as.character(varId)]]
+        meanVarChange <- c(
+          meanVarChange,
+          mean(abs(currentImputation - prevImputation))
+        )
+      }
+      convergenceInfo$meanVarChange <- meanVarChange
+    }
+    convergenceParameters[[iter]] <- convergenceInfo
+
+    prevImputations <- currentImputations
+  }
+
+  # calculate kde estimates of imputed and observed distributions per imputed variable
+  # and store in featureEngineeringSettings
+  kdeEstimates <- list()
+  for (varId in varsToImpute) {
+    varName <- originalData$covariateData$covariateRef %>%
+      dplyr::filter(.data$covariateId == varId) %>%
+      dplyr::pull(.data$covariateName)
+    rows <- numericData$missingIndex %>%
+      dplyr::filter(.data$covariateId == varId) %>%
+      dplyr::pull(.data$rowId)
+    imputedValues <- numericData$imputedCovariates %>%
+      dplyr::filter(
+        .data$covariateId == varId,
+        .data$rowId %in% rows
+      ) %>%
+      dplyr::pull(.data$imputedValue)
+    observedValues <- numericData$covariates %>%
+      dplyr::filter(
+        .data$covariateId == varId,
+        !is.na(.data$covariateValue)
+      ) %>%
+      dplyr::pull(.data$covariateValue)
+    kdeEstimates[[as.character(varId)]] <- list(
+      imputed = stats::density(imputedValues),
+      observed = stats::density(observedValues)
+    )
+  }
+  results <- list(
+    "numericData" = numericData,
+    "convergenceParameters" = convergenceParameters,
+    "modelInfo" = modelInfo,
+    "kdeEstimates" = kdeEstimates
+  )
+  return(results)
+}
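For orientation, a usage sketch showing how the new settings objects are meant to flow through the existing pipeline. The `plpData`, `populationSettings`, and outcome id are hypothetical placeholders, and it is assumed (as the rest of the package's feature-engineering code suggests) that `featureEngineeringSettings` accepts a list of settings objects; this is not part of the patch:

    featureEngineering <- list(
      createRareFeatureRemover(threshold = 0.001),
      createNormalizer(type = "robust"),
      createSimpleImputer(method = "median", missingThreshold = 0.3)
    )
    results <- runPlp(
      plpData = plpData,                       # hypothetical plpData object
      outcomeId = 2,                           # hypothetical outcome id
      populationSettings = populationSettings, # hypothetical settings
      featureEngineeringSettings = featureEngineering,
      modelSettings = setLassoLogisticRegression(),
      saveDirectory = file.path(tempdir(), "plpResults")
    )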
that wraps around FeatureExtraction::tidyCovariateData to normalise the data #' and remove rare or redundant features #' @@ -57,7 +56,8 @@ createPreprocessSettings <- function( #' @param covariateData The covariate part of the training data created by \code{splitData} after being sampled and having #' any required feature engineering #' @param preprocessSettings The settings for the preprocessing created by \code{createPreprocessSettings} -#' @return +#' @return the processed data +#' @keywords internal #' The data processed preprocessData <- function(covariateData, preprocessSettings) { diff --git a/R/RunPlp.R b/R/RunPlp.R index 4d6383f4..d5572b2d 100644 --- a/R/RunPlp.R +++ b/R/RunPlp.R @@ -213,7 +213,6 @@ runPlp <- function( executeSettings = createDefaultExecuteSettings(), saveDirectory = getwd() ) { - # start log analysisPath <- file.path(saveDirectory, analysisId) logSettings$saveDirectory <- analysisPath @@ -536,7 +535,6 @@ runPlp <- function( class(results) <- c("runPlp") ParallelLogger::logInfo("Run finished successfully.") - # save the results ParallelLogger::logInfo(paste0("Saving PlpResult")) tryCatch(savePlpResult(results, file.path(analysisPath, "plpResult")), diff --git a/man/createIterativeImputer.Rd b/man/createIterativeImputer.Rd new file mode 100644 index 00000000..b30aad49 --- /dev/null +++ b/man/createIterativeImputer.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Imputation.R +\name{createIterativeImputer} +\alias{createIterativeImputer} +\title{Create Iterative Imputer settings} +\usage{ +createIterativeImputer( + missingThreshold = 0.3, + method = "pmm", + methodSettings = list(pmm = list(k = 5, iterations = 5)) +) +} +\arguments{ +\item{missingThreshold}{The threshold for missing values to remove a feature} + +\item{method}{The method to use for imputation, currently only "pmm" is supported} + +\item{methodSettings}{A list of settings for the imputation method to use. 
+Currently only "pmm" is supported with the following settings: +- k: The number of donors to use for matching +- iterations: The number of iterations to use for imputation} +} +\value{ +The settings for the single imputer of class `featureEngineeringSettings` +} +\description{ +This function creates the settings for an iterative imputer +which first removes features with more than `missingThreshold` missing values +and then imputes the missing values iteratively using chained equations +} diff --git a/man/createNormalizer.Rd b/man/createNormalizer.Rd new file mode 100644 index 00000000..3f7276be --- /dev/null +++ b/man/createNormalizer.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FeatureEngineering.R +\name{createNormalizer} +\alias{createNormalizer} +\title{Create the settings for normalizing the data @param type The type of normalization to use, either "minmax" or "robust"} +\usage{ +createNormalizer(type = "minmax") +} +\arguments{ +\item{type}{The type of normalization to use, either "minmax" or "robust"} +} +\value{ +An object of class \code{featureEngineeringSettings} + +An object of class \code{featureEngineeringSettings}' +} +\description{ +Create the settings for normalizing the data @param type The type of normalization to use, either "minmax" or "robust" +} diff --git a/man/createRareFeatureRemover.Rd b/man/createRareFeatureRemover.Rd new file mode 100644 index 00000000..99ac4ee5 --- /dev/null +++ b/man/createRareFeatureRemover.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FeatureEngineering.R +\name{createRareFeatureRemover} +\alias{createRareFeatureRemover} +\title{Create the settings for removing rare features} +\usage{ +createRareFeatureRemover(threshold = 0.001) +} +\arguments{ +\item{threshold}{The minimum fraction of the training data that must have a +feature for it to be included} +} +\value{ +An object of class \code{featureEngineeringSettings} +} +\description{ +Create the settings for removing rare features +} diff --git a/man/createSimpleImputer.Rd b/man/createSimpleImputer.Rd new file mode 100644 index 00000000..f49fbb14 --- /dev/null +++ b/man/createSimpleImputer.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Imputation.R +\name{createSimpleImputer} +\alias{createSimpleImputer} +\title{Create Simple Imputer settings} +\usage{ +createSimpleImputer(method = "mean", missingThreshold = 0.3) +} +\arguments{ +\item{method}{The method to use for imputation, either "mean" or "median"} + +\item{missingThreshold}{The threshold for missing values to be imputed vs removed} +} +\value{ +The settings for the single imputer of class `featureEngineeringSettings` +} +\description{ +This function creates the settings for a simple imputer +which imputes missing values with the mean or median +} diff --git a/man/iterativeImpute.Rd b/man/iterativeImpute.Rd new file mode 100644 index 00000000..3bff2d34 --- /dev/null +++ b/man/iterativeImpute.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Imputation.R +\name{iterativeImpute} +\alias{iterativeImpute} +\title{Imputation} +\usage{ +iterativeImpute(trainData, featureEngineeringSettings, done = FALSE) +} +\arguments{ +\item{trainData}{The data to be imputed} + +\item{featureEngineeringSettings}{The settings for the imputation} + +\item{done}{Whether the imputation has already been done (bool)} +} +\value{ +The imputed data +} 
+\description{ +This function does single imputation with predictive mean matchin +} +\keyword{internal} diff --git a/man/minMaxNormalize.Rd b/man/minMaxNormalize.Rd new file mode 100644 index 00000000..7747b731 --- /dev/null +++ b/man/minMaxNormalize.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FeatureEngineering.R +\name{minMaxNormalize} +\alias{minMaxNormalize} +\title{A function that normalizes continous features to have values between 0 and 1} +\usage{ +minMaxNormalize(trainData, featureEngineeringSettings, done = FALSE) +} +\arguments{ +\item{trainData}{The training data to be normalized} + +\item{featureEngineeringSettings}{The settings for the normalization} + +\item{done}{Whether the data has already been normalized (bool)} +} +\value{ +The normalized data +} +\description{ +A function that normalizes continous features to have values between 0 and 1 +} +\details{ +uses value - min / (max - min) to normalize the data +} +\keyword{internal} diff --git a/man/pmmFit.Rd b/man/pmmFit.Rd new file mode 100644 index 00000000..ee7d35f5 --- /dev/null +++ b/man/pmmFit.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Imputation.R +\name{pmmFit} +\alias{pmmFit} +\title{Predictive mean matching using lasso} +\usage{ +pmmFit(data, k = 5) +} +\arguments{ +\item{data}{An andromeda object with the following fields: +xObs: covariates table for observed data +xMiss: covariates table for missing data +yObs: outcome variable that we want to impute} + +\item{k}{The number of donors to use for matching (default 5)} +} +\description{ +Predictive mean matching using lasso +} +\keyword{internal} diff --git a/man/preprocessData.Rd b/man/preprocessData.Rd index 4f9b18eb..f7c3112c 100644 --- a/man/preprocessData.Rd +++ b/man/preprocessData.Rd @@ -14,7 +14,7 @@ any required feature engineering} \item{preprocessSettings}{The settings for the preprocessing created by \code{createPreprocessSettings}} } \value{ -The data processed +the processed data } \description{ A function that wraps around FeatureExtraction::tidyCovariateData to normalise the data @@ -23,3 +23,7 @@ and remove rare or redundant features \details{ Returns an object of class \code{covariateData} that has been processed } +\keyword{The} +\keyword{data} +\keyword{internal} +\keyword{processed} diff --git a/man/removeRareFeatures.Rd b/man/removeRareFeatures.Rd new file mode 100644 index 00000000..6a5e73d9 --- /dev/null +++ b/man/removeRareFeatures.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FeatureEngineering.R +\name{removeRareFeatures} +\alias{removeRareFeatures} +\title{A function that removes rare features from the data} +\usage{ +removeRareFeatures(trainData, featureEngineeringSettings, done = FALSE) +} +\arguments{ +\item{trainData}{The data to be normalized} + +\item{featureEngineeringSettings}{The settings for the normalization} + +\item{done}{Whether to find and remove rare features or remove them only (bool)} +} +\value{ +The data with rare features removed +} +\description{ +A function that removes rare features from the data +} +\details{ +removes features that are present in less than a certain fraction of the population +} +\keyword{internal} diff --git a/man/robustNormalize.Rd b/man/robustNormalize.Rd new file mode 100644 index 00000000..f1ccd296 --- /dev/null +++ b/man/robustNormalize.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit 
diff --git a/man/robustNormalize.Rd b/man/robustNormalize.Rd
new file mode 100644
index 00000000..f1ccd296
--- /dev/null
+++ b/man/robustNormalize.Rd
@@ -0,0 +1,31 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/FeatureEngineering.R
+\name{robustNormalize}
+\alias{robustNormalize}
+\title{A function that normalizes continuous features by the interquartile range and forces
+the resulting values to be between -3 and 3 with f(x) = x / sqrt(1 + (x/3)^2)}
+\usage{
+robustNormalize(trainData, featureEngineeringSettings, done = FALSE)
+}
+\arguments{
+\item{trainData}{The training data to be normalized}
+
+\item{featureEngineeringSettings}{The settings for the normalization}
+
+\item{done}{Whether the data has already been normalized (bool)}
+}
+\value{
+The normalized data
+}
+\description{
+A function that normalizes continuous features by the interquartile range and forces
+the resulting values to be between -3 and 3 with f(x) = x / sqrt(1 + (x/3)^2)
+}
+\details{
+uses (value - median) / IQR to normalize the data and then
+applies the function f(x) = x / sqrt(1 + (x/3)^2) to the normalized values.
+This forces the values to be between -3 and 3 while preserving the relative
+ordering of the values.
+Based on https://arxiv.org/abs/2407.04491; see the paper for more details.
+}
+\keyword{internal}
diff --git a/man/simpleImpute.Rd b/man/simpleImpute.Rd
new file mode 100644
index 00000000..89be911e
--- /dev/null
+++ b/man/simpleImpute.Rd
@@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/Imputation.R
+\name{simpleImpute}
+\alias{simpleImpute}
+\title{Simple Imputation}
+\usage{
+simpleImpute(trainData, featureEngineeringSettings, done = FALSE)
+}
+\arguments{
+\item{trainData}{The data to be imputed}
+
+\item{featureEngineeringSettings}{The settings for the imputation}
+
+\item{done}{Whether the imputation has already been done (bool)}
+}
+\value{
+The imputed data
+}
+\description{
+This function does single imputation with the mean or median
+}
+\keyword{internal}
diff --git a/tests/testthat/test-featureEngineering.R b/tests/testthat/test-featureEngineering.R
index c283b2af..5570477c 100644
--- a/tests/testthat/test-featureEngineering.R
+++ b/tests/testthat/test-featureEngineering.R
@@ -20,7 +20,6 @@ context("FeatureEngineering")
 testFEFun <- function(type = "none") {
   result <- createFeatureEngineeringSettings(type = type)
-
   return(result)
 }
@@ -298,3 +297,124 @@ test_that("createStratifiedImputationSettings correct class", {
     numSubjects
   )
 })
+
+test_that("createNormalizer works", {
+  normalizer <- createNormalizer(type = "minmax")
+  expect_equal(normalizer$type, "minmax")
+  expect_equal(attr(normalizer, "fun"), "minMaxNormalize")
+  expect_s3_class(normalizer, "featureEngineeringSettings")
+
+  normalizer <- createNormalizer(type = "robust")
+  expect_equal(normalizer$type, "robust")
+
+  expect_error(createNormalizer(type = "mean"))
+  expect_error(createNormalizer(type = "median"))
+  expect_error(createNormalizer(type = "zscore"))
+  expect_error(createNormalizer(type = "none"))
+})
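As a quick numeric check on the formulas in robustNormalize.Rd above (plain base R, invented values): the IQR scaling alone is unbounded, and the squashing step is what pins the result inside (-3, 3):

    x <- c(-100, -1, 0, 1, 2, 1000) # includes extreme outliers
    scaled <- (x - median(x)) / IQR(x)
    squashed <- scaled / sqrt(1 + (scaled / 3)^2)
    range(squashed) # strictly inside (-3, 3); the ordering of x is preserved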
+
+test_that("normalization works", {
+  normalizer <- createNormalizer(type = "minmax")
+  addFeature <- function(data, covariateId, minValue, maxValue) {
+    data$covariateData <- Andromeda::copyAndromeda(data$covariateData)
+    nSubjects <- nrow(data$labels)
+    Andromeda::appendToTable(
+      data$covariateData$covariates,
+      data.frame(
+        rowId = data$labels$rowId,
+        covariateId = rep(covariateId, nSubjects),
+        covariateValue = runif(nSubjects, minValue, maxValue)
+      )
+    )
+    Andromeda::appendToTable(
+      data$covariateData$covariateRef,
+      data.frame(
+        covariateId = covariateId,
+        covariateName = "testCovariate",
+        analysisId = 101,
+        conceptId = 1
+      )
+    )
+    Andromeda::appendToTable(
+      data$covariateData$analysisRef,
+      data.frame(
+        analysisId = 101,
+        analysisName = "testAnalysis",
+        domainId = "testDomain",
+        startDay = 0,
+        endDay = 0,
+        isBinary = "N",
+        missingMeansZero = "N"
+      )
+    )
+    return(data)
+  }
+  data <- addFeature(tinyTrainData, 12101, -10, 10)
+  normalizedData <- minMaxNormalize(data, normalizer)
+
+  testNormData <- addFeature(testData, 12101, -10, 10)
+  # keep the raw test values before normalizing, so the expectation below
+  # can recompute the transform from scratch
+  rawTestFeature <- testNormData$covariateData$covariates %>%
+    dplyr::filter(.data$covariateId == 12101) %>%
+    dplyr::arrange(.data$rowId) %>%
+    dplyr::pull(.data$covariateValue)
+  metaData <- attr(normalizedData$covariateData, "metaData")
+  testSettings <- metaData$featureEngineering$minMaxNormalize$settings$featureEngineeringSettings
+  testNormalizedData <- minMaxNormalize(testNormData, testSettings, done = TRUE)
+
+  feature <- normalizedData$covariateData$covariates %>%
+    dplyr::filter(.data$covariateId == 12101) %>%
+    dplyr::pull(.data$covariateValue)
+  expect_true(all(feature >= 0) && all(feature <= 1))
+  testFeature <- testNormalizedData$covariateData$covariates %>%
+    dplyr::filter(.data$covariateId == 12101) %>%
+    dplyr::arrange(.data$rowId) %>%
+    dplyr::pull(.data$covariateValue)
+  # the test set must be scaled with the *raw* training min and max
+  trainFeature <- data$covariateData$covariates %>%
+    dplyr::filter(.data$covariateId == 12101) %>%
+    dplyr::pull(.data$covariateValue)
+  expect_equal(
+    testFeature,
+    (rawTestFeature - min(trainFeature)) / (max(trainFeature) - min(trainFeature))
+  )
+
+  normalizer <- createNormalizer(type = "robust")
+  data <- addFeature(tinyTrainData, 12101, -10, 10)
+  testNormData <- addFeature(testData, 12101, -10, 10)
+  rawTestFeature <- testNormData$covariateData$covariates %>%
+    dplyr::filter(.data$covariateId == 12101) %>%
+    dplyr::arrange(.data$rowId) %>%
+    dplyr::pull(.data$covariateValue)
+  newTrainData <- robustNormalize(data, normalizer)
+  metaData <- attr(newTrainData$covariateData, "metaData")
+  testSettings <- metaData$featureEngineering$robustNormalize$settings$featureEngineeringSettings
+  newTestData <- robustNormalize(testNormData, testSettings, done = TRUE)
+  feature <- newTrainData$covariateData$covariates %>%
+    dplyr::filter(.data$covariateId == 12101) %>%
+    dplyr::pull(.data$covariateValue)
+  expect_true(all(feature >= -3) && all(feature <= 3))
+  trainFeature <- data$covariateData$covariates %>%
+    dplyr::filter(.data$covariateId == 12101) %>%
+    dplyr::pull(.data$covariateValue)
+  testFeature <- newTestData$covariateData$covariates %>%
+    dplyr::filter(.data$covariateId == 12101) %>%
+    dplyr::arrange(.data$rowId) %>%
+    dplyr::pull(.data$covariateValue)
+  # recompute the documented transform from the raw values:
+  # (x - median) / IQR, then x / sqrt(1 + (x / 3)^2)
+  testNormFeature <- (rawTestFeature - median(trainFeature)) / IQR(trainFeature)
+  testNormFeature <- testNormFeature / sqrt(1 + (testNormFeature / 3)^2)
+  expect_equal(testFeature, testNormFeature, tolerance = 1e-4)
+  expect_true(all(testFeature >= -3) && all(testFeature <= 3))
+})
+
+test_that("createRareFeatureRemover works", {
+  remover <- createRareFeatureRemover(threshold = 0.1)
+  expect_equal(remover$threshold, 0.1)
+  expect_equal(attr(remover, "fun"), "removeRareFeatures")
+
+  expect_error(createRareFeatureRemover(threshold = -1))
+  expect_error(createRareFeatureRemover(threshold = "0.5"))
+  expect_error(createRareFeatureRemover(threshold = 1))
+})
+
+test_that("Removing rare features works", {
+  remover <- createRareFeatureRemover(threshold = 0.1)
+
+  removedData <- removeRareFeatures(tinyTrainData, remover)
+  expect_true(
+    removedData$covariateData$covariates %>%
+      dplyr::pull(.data$covariateId) %>%
+      dplyr::n_distinct() <=
+      tinyTrainData$covariateData$covariates %>%
+        dplyr::pull(.data$covariateId) %>%
+        dplyr::n_distinct()
+  )
+  metaData <- attr(removedData$covariateData, "metaData")
+  testSettings <- metaData$featureEngineering$removeRare$settings$featureEngineeringSettings
+
+  # apply the fitted settings (not the raw remover) to the test data
+  removedTestData <- removeRareFeatures(testData, testSettings, done = TRUE)
+  expect_true(
+    removedTestData$covariateData$covariates %>%
+      dplyr::pull(.data$covariateId) %>%
+      dplyr::n_distinct() <=
+      testData$covariateData$covariates %>%
+        dplyr::pull(.data$covariateId) %>%
+        dplyr::n_distinct()
+  )
+})
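The three tests above all exercise the same fit/apply contract, worth seeing in isolation. A hedged sketch with hypothetical trainData and testData objects: fit on the training data, then pull the fitted settings out of the covariateData metaData and apply them unchanged to the test set:

    trainData <- minMaxNormalize(trainData, createNormalizer(type = "minmax"))
    testSettings <- attr(trainData$covariateData, "metaData")$
      featureEngineering$minMaxNormalize$settings$featureEngineeringSettings
    # done = TRUE reuses the stored training min and max instead of refitting
    testData <- minMaxNormalize(testData, testSettings, done = TRUE)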
diff --git a/tests/testthat/test-imputation.R b/tests/testthat/test-imputation.R
new file mode 100644
index 00000000..a03cadf8
--- /dev/null
+++ b/tests/testthat/test-imputation.R
@@ -0,0 +1,235 @@
+# add a test numerical feature with missing values of a certain percentage
+createMissingData <- function(trainData, missingness) {
+  missingData <- list(
+    folds = trainData$folds,
+    labels = trainData$labels
+  )
+  missingData$covariateData <- Andromeda::copyAndromeda(trainData$covariateData)
+  rowIds <- missingData$labels$rowId
+  nData <- floor(length(rowIds) * (1 - missingness))
+  covariateId <- rep(666, nData)
+  withr::with_seed(
+    1234,
+    covariateValue <- runif(n = nData)
+  )
+  Andromeda::appendToTable(
+    missingData$covariateData$covariates,
+    data.frame(
+      rowId = rowIds[1:nData],
+      covariateId = covariateId,
+      covariateValue = covariateValue
+    )
+  )
+  Andromeda::appendToTable(
+    missingData$covariateData$covariateRef,
+    data.frame(
+      covariateId = 666,
+      covariateName = "fakeMissingVariable",
+      analysisId = 666,
+      conceptId = 666
+    )
+  )
+  Andromeda::appendToTable(
+    missingData$covariateData$analysisRef,
+    data.frame(
+      analysisId = 666,
+      analysisName = "missing",
+      domainId = "missing",
+      startDay = NA,
+      endDay = NA,
+      isBinary = "N",
+      missingMeansZero = "N"
+    )
+  )
+  missingData
+}
+
+test_that("createSimpleImputer works", {
+  imputer <- createSimpleImputer()
+
+  expect_equal(imputer$method, "mean")
+  expect_equal(imputer$missingThreshold, 0.3)
+  expect_equal(attr(imputer, "fun"), "simpleImpute")
+  expect_s3_class(imputer, "featureEngineeringSettings")
+
+  imputer <- createSimpleImputer(
+    method = "median",
+    missingThreshold = 0.5
+  )
+  expect_equal(imputer$method, "median")
+  expect_equal(imputer$missingThreshold, 0.5)
+  expect_s3_class(imputer, "featureEngineeringSettings")
+
+  expect_error(createSimpleImputer(method = "mean", missingThreshold = -1))
+  expect_error(createSimpleImputer(method = "mean", missingThreshold = "0.5"))
+  expect_error(createSimpleImputer(method = "mean", missingThreshold = 1))
+  expect_error(createSimpleImputer(method = "notMean"))
+})
+
+test_that("createIterativeImputer works", {
+  skip_if_not_installed("glmnet")
+  imputer <- createIterativeImputer()
+
+  expect_equal(imputer$method, "pmm")
+  expect_error(createIterativeImputer(method = "notPmm"))
+  expect_equal(attr(imputer, "fun"), "iterativeImpute")
+  expect_s3_class(imputer, "featureEngineeringSettings")
+  expect_error(createIterativeImputer(method = "pmm", missingThreshold = -1))
+  expect_error(createIterativeImputer(method = "pmm", missingThreshold = "0.5"))
+  expect_error(createIterativeImputer(method = "pmm", missingThreshold = 1))
+
+  imputer <- createIterativeImputer(
+    method = "pmm",
+    missingThreshold = 0.5
+  )
+  expect_equal(imputer$missingThreshold, 0.5)
+  expect_s3_class(imputer, "featureEngineeringSettings")
+})
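Before the imputation tests proper, a minimal base-R sketch of the chained-equations idea behind iterativeImpute (invented data, one incomplete column). Note the package's pmm method matches donors via a lasso rather than plugging in regression predictions, so this only shows the cycling structure:

    set.seed(1)
    x <- matrix(rnorm(100 * 3), ncol = 3)
    x[sample(100, 20), 2] <- NA # one incomplete column
    miss <- is.na(x[, 2])
    x[miss, 2] <- mean(x[!miss, 2]) # initialize with the column mean
    for (i in 1:5) { # with several incomplete columns, cycle over each in turn
      fit <- lm.fit(cbind(1, x[!miss, -2]), x[!miss, 2])
      x[miss, 2] <- cbind(1, x[miss, -2]) %*% fit$coefficients
    }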
+
+test_that("simpleImpute works", {
+  missingData <- createMissingData(tinyTrainData, 0.2)
+
+  imputer <- createSimpleImputer(method = "mean", missingThreshold = 0.3)
+
+  imputedData <- simpleImpute(missingData, imputer, done = FALSE)
+
+  newFeature <- imputedData$covariateData$covariates %>%
+    dplyr::filter(.data$covariateId == 666) %>%
+    dplyr::pull(.data$covariateValue)
+
+  originalFeature <- missingData$covariateData$covariates %>%
+    dplyr::filter(.data$covariateId == 666)
+
+  imputedFeature <- imputedData$covariateData$covariates %>%
+    dplyr::filter(
+      .data$covariateId == 666,
+      !.data$rowId %in% !!(originalFeature %>%
+        dplyr::pull(.data$rowId))
+    ) %>%
+    dplyr::pull(.data$covariateValue)
+  originalFeature <- originalFeature %>%
+    dplyr::pull(.data$covariateValue)
+
+  expect_true(length(newFeature) > length(originalFeature))
+  expect_equal(length(newFeature), nrow(imputedData$labels))
+  expect_equal(mean(originalFeature), unique(imputedFeature))
+
+  missingTestData <- createMissingData(testData, 0.2)
+  # extract the fitted featureEngineeringSettings from imputedData
+  metaData <- attr(imputedData$covariateData, "metaData")
+  testSettings <- metaData$featureEngineering$simpleImputer$settings$featureEngineeringSettings
+
+  imputedTestData <- simpleImpute(missingTestData, testSettings, done = TRUE)
+
+  newFeatureTest <- imputedTestData$covariateData$covariates %>%
+    dplyr::filter(.data$covariateId == 666) %>%
+    dplyr::pull(.data$covariateValue)
+  originalFeatureTest <- missingTestData$covariateData$covariates %>%
+    dplyr::filter(.data$covariateId == 666)
+  imputedFeatureTest <- imputedTestData$covariateData$covariates %>%
+    dplyr::filter(
+      .data$covariateId == 666,
+      !.data$rowId %in% !!(originalFeatureTest %>%
+        dplyr::pull(.data$rowId))
+    ) %>%
+    dplyr::pull(.data$covariateValue)
+  originalFeatureTest <- originalFeatureTest %>%
+    dplyr::pull(.data$covariateValue)
+
+  expect_true(length(newFeatureTest) > length(originalFeatureTest))
+  expect_equal(length(newFeatureTest), nrow(imputedTestData$labels))
+  # should use the mean from the training data
+  expect_equal(mean(originalFeature), unique(imputedFeatureTest))
+
+  imputer <- createSimpleImputer(method = "median", missingThreshold = 0.3)
+
+  imputedData <- simpleImpute(missingData, imputer, done = FALSE)
+
+  newFeature <- imputedData$covariateData$covariates %>%
+    dplyr::filter(.data$covariateId == 666) %>%
+    dplyr::pull(.data$covariateValue)
+
+  originalFeature <- missingData$covariateData$covariates %>%
+    dplyr::filter(.data$covariateId == 666)
+
+  imputedFeature <- imputedData$covariateData$covariates %>%
+    dplyr::filter(
+      .data$covariateId == 666,
+      !.data$rowId %in% !!(originalFeature %>%
+        dplyr::pull(.data$rowId))
+    ) %>%
+    dplyr::pull(.data$covariateValue)
+  originalFeature <- originalFeature %>%
+    dplyr::pull(.data$covariateValue)
+
+  expect_true(length(newFeature) > length(originalFeature))
+  expect_equal(length(newFeature), nrow(imputedData$labels))
+  expect_equal(median(originalFeature), unique(imputedFeature))
+
+  # with a stricter threshold the feature (20% missing) is removed entirely
+  imputer <- createSimpleImputer(method = "mean", missingThreshold = 0.1)
+  imputedData <- simpleImpute(missingData, imputer, done = FALSE)
+  newFeature <- imputedData$covariateData$covariates %>%
+    dplyr::filter(.data$covariateId == 666) %>%
+    dplyr::pull(.data$covariateValue)
+  expect_true(length(newFeature) == 0)
+})
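The final expectation in the test above checks the missingThreshold rule, which in isolation is just this arithmetic:

    nRows <- 100; nObserved <- 80
    missingness <- 1 - nObserved / nRows # 0.2 for the fake covariate
    missingness <= 0.3 # TRUE: impute it
    missingness <= 0.1 # FALSE: drop the feature instead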
+
+test_that("IterativeImputer works", {
+  skip_if_not_installed("glmnet")
+  missingData <- createMissingData(tinyTrainData, 0.2)
+  imputer <- createIterativeImputer(
+    method = "pmm",
+    missingThreshold = 0.3,
+    methodSettings = list(
+      pmm = list(
+        k = 1,
+        iterations = 1
+      )
+    )
+  )
+  imputedData <- iterativeImpute(missingData, imputer, done = FALSE)
+
+  newFeature <- imputedData$covariateData$covariates %>%
+    dplyr::filter(.data$covariateId == 666) %>%
+    dplyr::pull(.data$covariateValue)
+
+  originalFeature <- missingData$covariateData$covariates %>%
+    dplyr::filter(.data$covariateId == 666)
+
+  imputedFeature <- imputedData$covariateData$covariates %>%
+    dplyr::filter(
+      .data$covariateId == 666,
+      !.data$rowId %in% !!(originalFeature %>%
+        dplyr::pull(.data$rowId))
+    ) %>%
+    dplyr::pull(.data$covariateValue)
+  originalFeature <- originalFeature %>%
+    dplyr::pull(.data$covariateValue)
+
+  expect_true(length(newFeature) > length(originalFeature))
+  expect_equal(length(newFeature), nrow(imputedData$labels))
+
+  missingTestData <- createMissingData(testData, 0.2)
+  # extract the fitted featureEngineeringSettings from imputedData
+  metaData <- attr(imputedData$covariateData, "metaData")
+  testSettings <- metaData$featureEngineering$iterativeImputer$settings$featureEngineeringSettings
+
+  imputedTestData <- iterativeImpute(missingTestData, testSettings, done = TRUE)
+
+  newFeatureTest <- imputedTestData$covariateData$covariates %>%
+    dplyr::filter(.data$covariateId == 666) %>%
+    dplyr::pull(.data$covariateValue)
+  originalFeatureTest <- missingTestData$covariateData$covariates %>%
+    dplyr::filter(.data$covariateId == 666)
+  imputedFeatureTest <- imputedTestData$covariateData$covariates %>%
+    dplyr::filter(
+      .data$covariateId == 666,
+      !.data$rowId %in% !!(originalFeatureTest %>%
+        dplyr::pull(.data$rowId))
+    ) %>%
+    dplyr::pull(.data$covariateValue)
+  originalFeatureTest <- originalFeatureTest %>%
+    dplyr::pull(.data$covariateValue)
+
+  expect_true(length(newFeatureTest) > length(originalFeatureTest))
+  expect_equal(length(newFeatureTest), nrow(imputedTestData$labels))
+})