From 4e834fcbdcd3ad6e917b11b9b5e512372a7b6560 Mon Sep 17 00:00:00 2001 From: egillax Date: Wed, 4 Dec 2024 10:12:50 +0100 Subject: [PATCH] more work on imputations --- NAMESPACE | 4 + R/CyclopsModels.R | 4 +- R/PreprocessingData.R | 320 ++++++++++++++++++++++++++++++++++++------ R/RunPlp.R | 4 +- man/preprocessData.Rd | 2 +- 5 files changed, 286 insertions(+), 48 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index ad98ff62d..b80da52d4 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -28,11 +28,15 @@ export(createGlmModel) export(createLearningCurve) export(createLogSettings) export(createModelDesign) +export(createNormalization) export(createPlpResultTables) export(createPreprocessSettings) export(createRandomForestFeatureSelection) +export(createRareFeatureRemover) export(createRestrictPlpDataSettings) export(createSampleSettings) +export(createSingleImputer) +export(createSklearnImputer) export(createSplineSettings) export(createStratifiedImputationSettings) export(createStudyPopulation) diff --git a/R/CyclopsModels.R b/R/CyclopsModels.R index 16f035164..e97cf0f74 100644 --- a/R/CyclopsModels.R +++ b/R/CyclopsModels.R @@ -101,9 +101,9 @@ fitCyclopsModel <- function( noiseLevel = "silent", threads = settings$threads, maxIterations = settings$maxIterations, - seed = settings$seed + seed = settings$seed, + useKKTSwindle = FALSE ) - fit <- tryCatch({ ParallelLogger::logInfo('Running Cyclops') Cyclops::fitCyclopsModel( diff --git a/R/PreprocessingData.R b/R/PreprocessingData.R index 834d27a85..bd065ae45 100644 --- a/R/PreprocessingData.R +++ b/R/PreprocessingData.R @@ -2,13 +2,13 @@ # Copyright 2021 Observational Health Data Sciences and Informatics # # This file is part of PatientLevelPrediction -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -20,34 +20,31 @@ #' @details #' Returns an object of class \code{preprocessingSettings} that specifies how to preprocess the training data #' -#' @param minFraction The minimum fraction of target population who must have a covariate for it to be included in the model training +#' @param minFraction The minimum fraction of target population who must have a covariate for it to be included in the model training #' @param normalize Whether to normalise the covariates before training (Default: TRUE) #' @param removeRedundancy Whether to remove redundant features (Default: TRUE) #' @return #' An object of class \code{preprocessingSettings} #' @export createPreprocessSettings <- function( - minFraction = 0.001, - normalize = TRUE, - removeRedundancy = TRUE - ){ - - checkIsClass(minFraction, c('numeric','integer')) - checkHigherEqual(minFraction,0) - - checkIsClass(normalize, c("logical")) - - checkIsClass(removeRedundancy, c("logical")) - - preprocessingSettings <- list( - minFraction = minFraction, - normalize = normalize, - removeRedundancy = removeRedundancy - ) - + minFraction = 0.001, + normalize = TRUE, + removeRedundancy = TRUE) { + checkIsClass(minFraction, c("numeric", "integer")) + checkHigherEqual(minFraction, 0) + + checkIsClass(normalize, c("logical")) + + checkIsClass(removeRedundancy, c("logical")) + + preprocessingSettings <- list( + minFraction = minFraction, + normalize = normalize, + removeRedundancy = removeRedundancy + ) + class(preprocessingSettings) <- "preprocessSettings" return(preprocessingSettings) - } @@ -57,39 +54,274 @@ createPreprocessSettings <- function( #' @details #' Returns an object of class \code{covariateData} that has been processed #' -#' @param covariateData The covariate part of the training data created by \code{splitData} after being sampled and having -#' any required feature engineering -#' @param preprocessSettings The settings for the preprocessing created by \code{createPreprocessSettings} +#' @param covariateData The covariate part of the training data created by \code{splitData} after being sampled and having +#' any required feature engineering +#' @param preprocessSettings The settings for the preprocessing created by \code{createPreprocessSettings} #' @return -#' The data processed -preprocessData <- function (covariateData, - preprocessSettings){ - +#' The data processed +preprocessData <- function(covariateData, + preprocessSettings) { metaData <- attr(covariateData, "metaData") preprocessSettingsInput <- preprocessSettings # saving this before adding covariateData - + checkIsClass(covariateData, c("CovariateData")) checkIsClass(preprocessSettings, c("preprocessSettings")) - - ParallelLogger::logDebug(paste0('minFraction: ', preprocessSettings$minFraction)) - ParallelLogger::logDebug(paste0('normalize: ', preprocessSettings$normalize)) - ParallelLogger::logDebug(paste0('removeRedundancy: ', preprocessSettings$removeRedundancy)) - + + ParallelLogger::logDebug(paste0("minFraction: ", preprocessSettings$minFraction)) + ParallelLogger::logDebug(paste0("normalize: ", preprocessSettings$normalize)) + ParallelLogger::logDebug(paste0("removeRedundancy: ", preprocessSettings$removeRedundancy)) + preprocessSettings$covariateData <- covariateData covariateData <- do.call(FeatureExtraction::tidyCovariateData, preprocessSettings) - - #update covariateRef + + # update covariateRef removed <- unique(c( attr(covariateData, "metaData")$deletedInfrequentCovariateIds, attr(covariateData, "metaData")$deletedRedundantCovariateIds - ) - ) - covariateData$covariateRef <- covariateData$covariateRef %>% - dplyr::filter(!.data$covariateId %in% removed) - + )) + covariateData$covariateRef <- covariateData$covariateRef %>% + dplyr::filter(!.data$covariateId %in% removed) + metaData$tidyCovariateDataSettings <- attr(covariateData, "metaData") metaData$preprocessSettings <- preprocessSettingsInput attr(covariateData, "metaData") <- metaData - + return(covariateData) } + +#' A function that normalizes continous features to have values between 0 and 1 +#' @details uses value - min / (max - min) to normalize the data +#' @param trainData The training data to be normalized +#' @param featureEngineeringSettings The settings for the normalization +#' @param normalized Whether the data has already been normalized (bool) +#' @return The normalized data +minMaxNormalize <- function(trainData, featureEngineeringSettings, normalized = FALSE) { + if (!normalized) { + # fit the normalization + # find continuous features from trainData$covariateData$analysisRef + continousFeatures <- trainData$covariateData$analysisRef %>% + dplyr::filter(.data$isBinary == "N") %>% + dplyr::select("analysisId") %>% + dplyr::inner_join(trainData$covariateData$covariateRef, by = "analysisId") %>% + dplyr::pull(.data$covariateId) + + # get max of each feature + trainData$covariateData$minMaxs <- trainData$covariateData$covariates %>% + dplyr::filter(.data$covariateId %in% continousFeatures) %>% + dplyr::group_by(.data$covariateId) %>% + dplyr::summarise( + max = max(.data$covariateValue, na.rm = TRUE), + min = min(.data$covariateValue, na.rm = TRUE) + ) %>% + dplyr::collect() + + # save the normalization + attr(featureEngineeringSettings, "minMaxs") <- + trainData$covariateData$minMaxs %>% dplyr::collect() + + # apply the normalization to trainData + trainData$covariateData$covariates <- trainData$covariateData$covariates %>% + dplyr::left_join(trainData$covariateData$minMaxs, by = "covariateId") %>% + # use ifelse to only normalize if min and max are not NA as is the case + # for continous features, else return original value + dplyr::mutate(covariateValue = ifelse(!is.na(min) & !is.na(max), + (.data$covariateValue - min) / (max - min), + .data$covariateValue + )) %>% + dplyr::select(-c("max", "min")) + trainData$covariateData$minMaxs <- NULL + normalized <- TRUE + } else { + # apply the normalization to test data by using saved normalization values + trainData$covariateData$covariates <- trainData$covariateData$covariates %>% + dplyr::left_join(attr(featureEngineeringSettings, "minMaxs"), + by = "covariateId", copy = TRUE + ) %>% + dplyr::mutate(covariateValue = ifelse(!is.na(min) & !is.na(max), + (.data$covariateValue - min) / (max - min), + .data$covariateValue + )) %>% + dplyr::select(-c("max", "min")) + } + featureEngineering <- list( + funct = "minMaxNormalize", + settings = list( + featureEngineeringSettings = featureEngineeringSettings, + normalized = normalized + ) + ) + + attr(trainData, "metaData")$featureEngineering <- listAppend( + attr(trainData, "metaData")$featureEngineering, + featureEngineering + ) + return(trainData) +} + +#' A function that normalizes continous by the interquartile range and forces +#' the resulting values to be between -3 and 3 with f(x) = x / sqrt(1 + (x/3)^2) +#' @details uses (value - median) / iqr to normalize the data and then +#' applies the function f(x) = x / sqrt(1 + (x/3)^2) to the normalized values. +#' This forces the values to be between -3 and 3 while preserving the relative +#' ordering of the values.' +#' based on https://arxiv.org/abs/2407.04491 for more details +#' @param trainData The training data to be normalized +#' @param featureEngineeringSettings The settings for the normalization +#' @param normalized Whether the data has already been normalized (bool) +robustNormalize <- function(trainData, featureEngineeringSettings, normalized = FALSE) { + if (!normalized) { + # find continuous features from trainData$covariateData$analysisRef + continousFeatures <- trainData$covariateData$analysisRef %>% + dplyr::filter(.data$isBinary == "N") %>% + dplyr::select("analysisId") %>% + dplyr::inner_join(trainData$covariateData$covariateRef, by = "analysisId") %>% + dplyr::pull(.data$covariateId) + + # get (25, 75)% quantiles of each feature + # sqlite (used by Andromeda) doesn't have quantile function, so we need to load the extension + # to get upper_quartile and lower_quartile_functions + con <- trainData$covariateData$covariates %>% dbplyr::remote_con() + RSQLite::initExtension(con) + + trainData$covariateData$quantiles <- trainData$covariateData$covariates %>% + dplyr::filter(.data$covariateId %in% continousFeatures) %>% + dplyr::group_by(.data$covariateId) %>% + dplyr::summarise( + q25 = dplyr::sql("lower_quartile(covariateValue)"), + q75 = dplyr::sql("upper_quartile(covariateValue)"), + median = median(.data$covariateValue, na.rm = TRUE) + ) %>% + dplyr::mutate(iqr = .data$q75 - .data$q25) %>% + dplyr::select(-c("q75", "q25")) %>% + dplyr::collect() + + # save the normalization + attr(featureEngineeringSettings, "quantiles") <- + trainData$covariateData$quantiles %>% dplyr::collect() + + # apply the normalization to trainData + trainData$covariateData$covariates <- trainData$covariateData$covariates %>% + dplyr::left_join(trainData$covariateData$quantiles, by = "covariateId") %>% + # use ifelse to only normalize continous features + dplyr::mutate(covariateValue = ifelse(!is.na(.data$iqr) & !is.na(.data$median), + (.data$covariateValue - .data$median) / .data$iqr, + .data$covariateValue + )) %>% + # smoothly clip the range to [-3, 3] with x / sqrt(1 + (x/3)^2) + # ref: https://arxiv.org/abs/2407.04491 + dplyr::mutate(covariateValue = ifelse(!is.na(.data$iqr) & !is.na(.data$median), + .data$covariateValue / sqrt(1 + (.data$covariateValue / 3)^2), + .data$covariateValue + )) %>% + dplyr::select(-c("median", "iqr")) + trainData$covariateData$quantiles <- NULL + normalized <- TRUE + } else { + # apply the normalization to test data by using saved normalization values + trainData$covariateData$covariates <- trainData$covariateData$covariates %>% + dplyr::left_join(attr(featureEngineeringSettings, "quantiles"), + by = "covariateId", copy = TRUE + ) %>% + dplyr::mutate(covariateValue = ifelse(!is.na(.data$iqr) & !is.na(.data$median), + (.data$covariateValue - .data$median) / .data$iqr, + .data$covariateValue + )) %>% + dplyr::mutate(covariateValue = ifelse(!is.na(.data$iqr) & !is.na(.data$median), + .data$covariateValue / sqrt(1 + (.data$covariateValue / 3)^2), + .data$covariateValue + )) %>% + dplyr::select(-c("median", "iqr")) + } + featureEngineering <- list( + funct = "robustNormalize", + settings = list( + featureEngineeringSettings = featureEngineeringSettings, + normalized = normalized + ) + ) + + attr(trainData, "metaData")$featureEngineering <- listAppend( + attr(trainData, "metaData")$featureEngineering, + featureEngineering + ) + return(trainData) +} + +#' Create the settings for normalizing the data +#' @param type The type of normalization to use, either "minmax" or "robust" +#' @return An object of class \code{featureEngineeringSettings} +#' @export +createNormalization <- function(type = "minmax") { + featureEngineeringSettings <- list( + type = type + ) + if (type == "minmax") { + attr(featureEngineeringSettings, "fun") <- "minMaxNormalize" + } else if (type == "robust") { + attr(featureEngineeringSettings, "fun") <- "robustNormalize" + } + + class(featureEngineeringSettings) <- "featureEngineeringSettings" + return(featureEngineeringSettings) +} + +#' A function that removes rare features from the data +#' @details removes features that are present in less than a certain fraction of the population +#' @param trainData The data to be normalized +#' @param featureEngineeringSettings The settings for the normalization +#' @param findRare Whether to find and remove rare features or remove them only (bool) +removeRareFeatures <- function(trainData, featureEngineeringSettings, findRare = FALSE) { + if (!findRare) { + rareFeatures <- trainData$covariateData$covariates %>% + dplyr::group_by(.data$covariateId) %>% + dplyr::summarise(count = dplyr::n()) %>% + dplyr::collect() + rareFeatures <- rareFeatures %>% + dplyr::mutate(ratio = .data$count / ( + trainData$covariateData$covariates %>% + dplyr::summarise(popSize = dplyr::n_distinct(.data$rowId)) %>% + dplyr::pull() + )) %>% + dplyr::filter(.data$ratio <= featureEngineeringSettings$ratio) %>% + dplyr::pull(c("covariateId")) + + trainData$covariateData$covariates <- trainData$covariateData$covariates %>% + dplyr::filter(!.data$covariateId %in% rareFeatures) + + attr(featureEngineeringSettings, "rareFeatures") <- rareFeatures + + findRare <- TRUE + } else { + trainData$covariateData$covariates <- trainData$covariateData$covariates %>% + dplyr::filter( + !.data$covariateId %in% !!attr(featureEngineeringSettings, "rareFeatures") + ) + } + featureEngineering <- list( + funct = "removeRareFeatures", + settings = list( + featureEngineeringSettings = featureEngineeringSettings, + findRare = findRare + ) + ) + attr(trainData, "metaData")$featureEngineering <- listAppend( + attr(trainData, "metaData")$featureEngineering, + featureEngineering + ) + return(trainData) +} + +#' Create the settings for removing rare features +#' @param ratio The minimum fraction of the training data that must have a +#' feature for it to be included +#' @return An object of class \code{featureEngineeringSettings} +#' @export +createRareFeatureRemover <- function(ratio = 0.001) { + featureEngineeringSettings <- list( + ratio = ratio + ) + attr(featureEngineeringSettings, "fun") <- "removeRareFeatures" + + class(featureEngineeringSettings) <- "featureEngineeringSettings" + return(featureEngineeringSettings) +} diff --git a/R/RunPlp.R b/R/RunPlp.R index 2d7119388..7bae85020 100644 --- a/R/RunPlp.R +++ b/R/RunPlp.R @@ -213,6 +213,7 @@ runPlp <- function( executeSettings = createDefaultExecuteSettings(), saveDirectory = getwd() ){ + start <- Sys.time() # start log analysisPath <- file.path(saveDirectory, analysisId) @@ -509,7 +510,8 @@ runPlp <- function( class(results) <- c('runPlp') ParallelLogger::logInfo("Run finished successfully.") - + end <- Sys.time() + ParallelLogger::logInfo(paste0('Total time taken: ', end - start)) # save the results ParallelLogger::logInfo(paste0('Saving PlpResult')) tryCatch(savePlpResult(results, file.path(analysisPath,'plpResult')), diff --git a/man/preprocessData.Rd b/man/preprocessData.Rd index 2ace52208..4f9b18eb2 100644 --- a/man/preprocessData.Rd +++ b/man/preprocessData.Rd @@ -8,7 +8,7 @@ and remove rare or redundant features} preprocessData(covariateData, preprocessSettings) } \arguments{ -\item{covariateData}{The covariate part of the training data created by \code{splitData} after being sampled and having +\item{covariateData}{The covariate part of the training data created by \code{splitData} after being sampled and having any required feature engineering} \item{preprocessSettings}{The settings for the preprocessing created by \code{createPreprocessSettings}}