From 0645a72c075a37b7ba91cbdbfddd3656eab30c4b Mon Sep 17 00:00:00 2001 From: egillax Date: Mon, 9 Dec 2024 17:17:02 +0100 Subject: [PATCH] fix flaky imputation test --- R/FeatureEngineering.R | 441 +++++++++++------------ R/HelperFunctions.R | 8 - tests/testthat/helper-functions.R | 52 +-- tests/testthat/test-featureEngineering.R | 262 +++++++------- tests/testthat/test-population.R | 6 +- 5 files changed, 372 insertions(+), 397 deletions(-) diff --git a/R/FeatureEngineering.R b/R/FeatureEngineering.R index a5f4bc1ca..76e23da13 100644 --- a/R/FeatureEngineering.R +++ b/R/FeatureEngineering.R @@ -2,13 +2,13 @@ # Copyright 2021 Observational Health Data Sciences and Informatics # # This file is part of PatientLevelPrediction -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -16,29 +16,29 @@ # limitations under the License. -featureEngineer <- function(data, featureEngineeringSettings){ - - ParallelLogger::logInfo('Starting Feature Engineering') - +featureEngineer <- function(data, featureEngineeringSettings) { + ParallelLogger::logInfo("Starting Feature Engineering") + # if a single setting, make it a list - if(inherits(featureEngineeringSettings, 'featureEngineeringSettings')){ + if (inherits(featureEngineeringSettings, "featureEngineeringSettings")) { featureEngineeringSettings <- list(featureEngineeringSettings) } - - for(featureEngineeringSetting in featureEngineeringSettings){ + + for (featureEngineeringSetting in featureEngineeringSettings) { fun <- attr(featureEngineeringSetting, "fun") - args <- list(trainData = data, - featureEngineeringSettings = featureEngineeringSetting) - ParallelLogger::logInfo(paste0('Applying ',fun)) + args <- list( + trainData = data, + featureEngineeringSettings = featureEngineeringSetting + ) + ParallelLogger::logInfo(paste0("Applying ", fun)) data <- do.call(eval(parse(text = fun)), args) } - - attr(data, 'metaData')$featureEngineeringSettings <- featureEngineeringSettings - - ParallelLogger::logInfo('Done Feature Engineering') - + + attr(data, "metaData")$featureEngineeringSettings <- featureEngineeringSettings + + ParallelLogger::logInfo("Done Feature Engineering") + return(data) - } #' Create the settings for defining any feature engineering that will be done @@ -47,22 +47,20 @@ featureEngineer <- function(data, featureEngineeringSettings){ #' Returns an object of class \code{featureEngineeringSettings} that specifies the sampling function that will be called and the settings #' #' @param type (character) Choice of: \itemize{ -#' \item'none' No feature engineering - this is the default -#' } +#' \item'none' No feature engineering - this is the default +#' } #' #' @return #' An object of class \code{featureEngineeringSettings} #' @export -createFeatureEngineeringSettings <- function(type = 'none'){ - +createFeatureEngineeringSettings <- function(type = "none") { featureEngineeringSettings <- list() - - if(type == 'none'){ + + if (type == "none") { attr(featureEngineeringSettings, "fun") <- "sameData" } class(featureEngineeringSettings) <- "featureEngineeringSettings" return(featureEngineeringSettings) - } @@ -76,22 +74,20 @@ createFeatureEngineeringSettings <- function(type = 'none'){ #' @return #' An object of class \code{featureEngineeringSettings} #' @export -createUnivariateFeatureSelection <- function(k = 100){ - - if (inherits(k, 'numeric')) { +createUnivariateFeatureSelection <- function(k = 100) { + if (inherits(k, "numeric")) { k <- as.integer(k) } - - checkIsClass(k, 'integer') + + checkIsClass(k, "integer") checkHigherEqual(k, 0) - - featureEngineeringSettings <- list(k = k) - + + featureEngineeringSettings <- list(k = k) + attr(featureEngineeringSettings, "fun") <- "univariateFeatureSelection" class(featureEngineeringSettings) <- "featureEngineeringSettings" - + return(featureEngineeringSettings) - } #' Create the settings for random foreat based feature selection @@ -105,21 +101,20 @@ createUnivariateFeatureSelection <- function(k = 100){ #' @return #' An object of class \code{featureEngineeringSettings} #' @export -createRandomForestFeatureSelection <- function(ntrees = 2000, maxDepth = 17){ - - checkIsClass(ntrees, c('numeric','integer')) - checkIsClass(maxDepth, c('numeric','integer')) +createRandomForestFeatureSelection <- function(ntrees = 2000, maxDepth = 17) { + checkIsClass(ntrees, c("numeric", "integer")) + checkIsClass(maxDepth, c("numeric", "integer")) checkHigher(ntrees, 0) checkHigher(maxDepth, 0) - + featureEngineeringSettings <- list( ntrees = ntrees, max_depth = maxDepth - ) - + ) + attr(featureEngineeringSettings, "fun") <- "randomForestFeatureSelection" class(featureEngineeringSettings) <- "featureEngineeringSettings" - + return(featureEngineeringSettings) } @@ -130,7 +125,7 @@ createRandomForestFeatureSelection <- function(ntrees = 2000, maxDepth = 17){ #' #' @param continousCovariateId The covariateId to apply splines to #' @param knots Either number of knots of vector of split values -#' @param analysisId The analysisId to use for the spline covariates +#' @param analysisId The analysisId to use for the spline covariates #' #' @return #' An object of class \code{featureEngineeringSettings} @@ -138,34 +133,29 @@ createRandomForestFeatureSelection <- function(ntrees = 2000, maxDepth = 17){ createSplineSettings <- function( continousCovariateId, knots, - analysisId = 683 - ){ - - checkIsClass(continousCovariateId, c('numeric','integer')) - checkIsClass(knots, c('numeric','integer')) - + analysisId = 683) { + checkIsClass(continousCovariateId, c("numeric", "integer")) + checkIsClass(knots, c("numeric", "integer")) + featureEngineeringSettings <- list( continousCovariateId = continousCovariateId, knots = knots, analysisId = analysisId ) - + attr(featureEngineeringSettings, "fun") <- "splineCovariates" class(featureEngineeringSettings) <- "featureEngineeringSettings" - + return(featureEngineeringSettings) } - + splineCovariates <- function( - trainData, + trainData, featureEngineeringSettings, - knots = NULL - ){ - - ParallelLogger::logInfo('Starting splineCovariates') - - if(is.null(knots)){ - + knots = NULL) { + ParallelLogger::logInfo("Starting splineCovariates") + + if (is.null(knots)) { if (length(featureEngineeringSettings$knots) == 1) { measurements <- trainData$covariateData$covariates %>% dplyr::filter(.data$covariateId == !!featureEngineeringSettings$continousCovariateId) %>% @@ -175,112 +165,109 @@ splineCovariates <- function( } else { knots <- featureEngineeringSettings$knots } - } - - # apply the spline mapping + + # apply the spline mapping trainData <- splineMap( data = trainData, covariateId = featureEngineeringSettings$continousCovariateId, analysisId = featureEngineeringSettings$analysisId, knots = knots - ) + ) featureEngineering <- list( - funct = 'splineCovariates', + funct = "splineCovariates", settings = list( featureEngineeringSettings = featureEngineeringSettings, knots = knots ) ) - + # add the feature engineering in - attr(trainData, 'metaData')$featureEngineering = listAppend( - attr(trainData, 'metaData')$featureEngineering, + attr(trainData, "metaData")$featureEngineering <- listAppend( + attr(trainData, "metaData")$featureEngineering, featureEngineering ) - ParallelLogger::logInfo('Finished splineCovariates') - + ParallelLogger::logInfo("Finished splineCovariates") + return(trainData) } # create the spline map to add spline columns splineMap <- function( - data, + data, covariateId, analysisId, - knots -){ - - ParallelLogger::logInfo('Starting splineMap') + knots) { + ParallelLogger::logInfo("Starting splineMap") measurements <- data$covariateData$covariates %>% dplyr::filter(.data$covariateId == !!covariateId) %>% as.data.frame() - + designMatrix <- splines::bs( - x = measurements$covariateValue,#knots[1]:knots[length(knots)], + x = measurements$covariateValue, # knots[1]:knots[length(knots)], knots = knots[2:(length(knots) - 1)], Boundary.knots = knots[c(1, length(knots))] ) - + data$covariateData$covariates <- data$covariateData$covariates %>% dplyr::filter(.data$covariateId != !!covariateId) - + # get the covariate name details <- data$covariateData$covariateRef %>% dplyr::filter(.data$covariateId == !!covariateId) %>% as.data.frame() covariateName <- details$covariateName - + data$covariateData$covariateRef <- data$covariateData$covariateRef %>% dplyr::filter(.data$covariateId != !!covariateId) - + # remove last 3 numbers as this was old analysis id - covariateId <- floor(covariateId/1000) - + covariateId <- floor(covariateId / 1000) + # add the spline columns - for(i in 1:ncol(designMatrix)){ + for (i in 1:ncol(designMatrix)) { Andromeda::appendToTable( - tbl = data$covariateData$covariates, + tbl = data$covariateData$covariates, data = data.frame( rowId = measurements$rowId, - covariateId = covariateId*10000+i*1000+analysisId, - covariateValue = designMatrix[,i] + covariateId = covariateId * 10000 + i * 1000 + analysisId, + covariateValue = designMatrix[, i] ) ) } - + # add the covariates to the ref table Andromeda::appendToTable( tbl = data$covariateData$covariateRef, data = data.frame( - covariateId = covariateId*10000+(1:(ncol(designMatrix)))*1000+analysisId, + covariateId = covariateId * 10000 + (1:(ncol(designMatrix))) * 1000 + analysisId, covariateName = paste( - paste0(covariateName," spline component "), + paste0(covariateName, " spline component "), 1:ncol(designMatrix) ), conceptId = 0, analysisId = analysisId ) ) - + # add analysisRef for the first time a spline is added analysisRef <- data$covariateData$analysisRef %>% as.data.frame() - if(!analysisId %in% analysisRef$analysisId){ + if (!analysisId %in% analysisRef$analysisId) { Andromeda::appendToTable( tbl = data$covariateData$analysisRef, data = data.frame( analysisId = analysisId, - analysisName = 'splines', - domainId = 'feature engineering', + analysisName = "splines", + domainId = "feature engineering", startDay = 0, endDay = 0, - isBinary = 'N', - missingMeansZero = 'N' + isBinary = "N", + missingMeansZero = "N" ) ) } - ParallelLogger::logInfo('Finished splineMap') + ParallelLogger::logInfo("Finished splineMap") return(data) } @@ -299,251 +286,233 @@ splineMap <- function( #' @export createStratifiedImputationSettings <- function( covariateId, - ageSplits = NULL -){ - - checkIsClass(covariateId, c('numeric','integer')) - checkIsClass(ageSplits, c('numeric','integer')) - + ageSplits = NULL) { + checkIsClass(covariateId, c("numeric", "integer")) + checkIsClass(ageSplits, c("numeric", "integer")) + featureEngineeringSettings <- list( covariateId = covariateId, ageSplits = ageSplits ) - + attr(featureEngineeringSettings, "fun") <- "stratifiedImputeCovariates" class(featureEngineeringSettings) <- "featureEngineeringSettings" - + return(featureEngineeringSettings) } stratifiedImputeCovariates <- function( - trainData, + trainData, featureEngineeringSettings, - stratifiedMeans = NULL -){ - - if(is.null(stratifiedMeans)){ - + stratifiedMeans = NULL) { + if (is.null(stratifiedMeans)) { stratifiedMeans <- calculateStratifiedMeans( trainData = trainData, featureEngineeringSettings = featureEngineeringSettings ) - } - + trainData <- imputeMissingMeans( - trainData = trainData, + trainData = trainData, covariateId = featureEngineeringSettings$covariateId, ageSplits = featureEngineeringSettings$ageSplits, stratifiedMeans = stratifiedMeans - ) - + ) + return(trainData) } calculateStratifiedMeans <- function( trainData, - featureEngineeringSettings -){ - if(is.null(featureEngineeringSettings$ageSplits)){ - trainData$cohorts$ageGroup <- floor(trainData$cohorts$ageYear/5) - } else{ + featureEngineeringSettings) { + if (is.null(featureEngineeringSettings$ageSplits)) { + trainData$cohorts$ageGroup <- floor(trainData$cohorts$ageYear / 5) + } else { trainData$cohorts$ageGroup <- rep(0, length(trainData$cohorts$ageYear)) - for(i in 1:length(featureEngineeringSettings$ageSplits)){ + for (i in seq_along(featureEngineeringSettings$ageSplits)) { trainData$cohorts$ageGroup[trainData$cohorts$ageYear > featureEngineeringSettings$ageSplits[i]] <- i } } - - trainData$covariateData$cohorts <- trainData$cohorts[,c('rowId', 'ageGroup', 'gender')] - + + trainData$covariateData$cohorts <- trainData$cohorts[, c("rowId", "ageGroup", "gender")] + stratifiedMeans <- trainData$covariateData$covariates %>% dplyr::filter(.data$covariateId == !!featureEngineeringSettings$covariateId) %>% dplyr::inner_join( - y = trainData$covariateData$cohorts, - by = c('rowId') + y = trainData$covariateData$cohorts, + by = c("rowId") ) %>% dplyr::group_by(.data$ageGroup, .data$gender) %>% dplyr::summarise(covariateValue = mean(.data$covariateValue, na.rm = TRUE)) %>% as.data.frame() - + return(stratifiedMeans) } imputeMissingMeans <- function( - trainData, - covariateId, - ageSplits, - stratifiedMeans -){ - - if(is.null(ageSplits)){ - trainData$cohorts$ageGroup <- floor(trainData$cohorts$ageYear/5) - } else{ + trainData, + covariateId, + ageSplits, + stratifiedMeans) { + if (is.null(ageSplits)) { + trainData$cohorts$ageGroup <- floor(trainData$cohorts$ageYear / 5) + } else { trainData$cohorts$ageGroup <- rep(0, length(trainData$cohorts$ageYear)) - for(i in 1:length(ageSplits)){ + for (i in seq_along(ageSplits)) { trainData$cohorts$ageGroup[trainData$cohorts$ageYear > ageSplits[i]] <- i } } - + rowIdsWithValues <- trainData$covariateData$covariates %>% - dplyr::filter(.data$covariateId == !! covariateId) %>% - dplyr::select('rowId') %>% + dplyr::filter(.data$covariateId == !!covariateId) %>% + dplyr::select("rowId") %>% dplyr::pull() rowIdsWithMissingValues <- trainData$cohorts$rowId[!trainData$cohorts$rowId %in% rowIdsWithValues] - - imputedData <- trainData$cohorts %>% + + imputedData <- trainData$cohorts %>% dplyr::filter(.data$rowId %in% rowIdsWithMissingValues) %>% - dplyr::select('rowId', 'ageGroup', 'gender') %>% + dplyr::select("rowId", "ageGroup", "gender") %>% dplyr::left_join( - y = stratifiedMeans, - by = c('ageGroup', 'gender') - ) %>% + y = stratifiedMeans, + by = c("ageGroup", "gender") + ) %>% dplyr::mutate( covariateId = !!covariateId, covariateValue = .data$covariateValue - ) %>% - dplyr::select('rowId', 'covariateId', 'covariateValue') - + ) %>% + dplyr::select("rowId", "covariateId", "covariateValue") + Andromeda::appendToTable( - tbl = trainData$covariateData$covariates, + tbl = trainData$covariateData$covariates, data = imputedData ) - + return(trainData) } univariateFeatureSelection <- function( - trainData, - featureEngineeringSettings, - covariateIdsInclude = NULL){ - - if(is.null(covariateIdsInclude)){ - #convert data into matrix: + trainData, + featureEngineeringSettings, + covariateIdsInclude = NULL) { + if (is.null(covariateIdsInclude)) { + # convert data into matrix: mappedData <- toSparseM(trainData, trainData$labels) - + matrixData <- mappedData$dataMatrix labels <- mappedData$labels covariateMap <- mappedData$covariateMap - + X <- reticulate::r_to_py(matrixData) - y <- reticulate::r_to_py(labels[,'outcomeCount']) - - np <- reticulate::import('numpy') - os <- reticulate::import('os') - sys <- reticulate::import('sys') - math <- reticulate::import('math') - scipy <- reticulate::import('scipy') - - sklearn <- reticulate::import('sklearn') - + y <- reticulate::r_to_py(labels[, "outcomeCount"]) + + np <- reticulate::import("numpy") + os <- reticulate::import("os") + sys <- reticulate::import("sys") + math <- reticulate::import("math") + scipy <- reticulate::import("scipy") + + sklearn <- reticulate::import("sklearn") + SelectKBest <- sklearn$feature_selection$SelectKBest chi2 <- sklearn$feature_selection$chi2 - + kbest <- SelectKBest(chi2, k = featureEngineeringSettings$k)$fit(X, y$outcomeCount) kbest$scores_ <- np$nan_to_num(kbest$scores_) # taken from sklearn code, matches the application during transform call k <- featureEngineeringSettings$k - mask <- np$zeros(length(kbest$scores_), dtype='bool') - mask[np$argsort(kbest$scores_, kind="mergesort")+1][(length(kbest$scores_)-k+1):length(kbest$scores_)] <- TRUE - - covariateIdsInclude <- covariateMap[mask,]$covariateId + mask <- np$zeros(length(kbest$scores_), dtype = "bool") + mask[np$argsort(kbest$scores_, kind = "mergesort") + 1][(length(kbest$scores_) - k + 1):length(kbest$scores_)] <- TRUE + + covariateIdsInclude <- covariateMap[mask, ]$covariateId } - - trainData$covariateData$covariates <- trainData$covariateData$covariates %>% + + trainData$covariateData$covariates <- trainData$covariateData$covariates %>% dplyr::filter(.data$covariateId %in% covariateIdsInclude) - - trainData$covariateData$covariateRef <- trainData$covariateData$covariateRef %>% + + trainData$covariateData$covariateRef <- trainData$covariateData$covariateRef %>% dplyr::filter(.data$covariateId %in% covariateIdsInclude) - + featureEngineering <- list( - funct = 'univariateFeatureSelection', + funct = "univariateFeatureSelection", settings = list( featureEngineeringSettings = featureEngineeringSettings, covariateIdsInclude = covariateIdsInclude ) ) - - attr(trainData, 'metaData')$featureEngineering = listAppend( - attr(trainData, 'metaData')$featureEngineering, + + attr(trainData, "metaData")$featureEngineering <- listAppend( + attr(trainData, "metaData")$featureEngineering, featureEngineering ) - + return(trainData) - } randomForestFeatureSelection <- function( - trainData, - featureEngineeringSettings, - covariateIdsInclude = NULL -){ - - if(is.null(covariateIdsInclude)){ - #convert data into matrix: + trainData, + featureEngineeringSettings, + covariateIdsInclude = NULL) { + if (is.null(covariateIdsInclude)) { + # convert data into matrix: mappedData <- toSparseM(trainData) - + matrixData <- mappedData$dataMatrix labels <- mappedData$labels covariateMap <- mappedData$covariateMap - + X <- reticulate::r_to_py(matrixData) - y <- reticulate::r_to_py(matrix(labels$outcomeCount, ncol=1)) - - np <- reticulate::import('numpy') - os <- reticulate::import('os') - sys <- reticulate::import('sys') - math <- reticulate::import('math') - scipy <- reticulate::import('scipy') - - sklearn <- reticulate::import('sklearn') - - ntrees = featureEngineeringSettings$ntrees #2000 - max_depth = featureEngineeringSettings$max_depth #17 - - rf = sklearn$ensemble$RandomForestClassifier( - max_features = 'sqrt', + y <- reticulate::r_to_py(matrix(labels$outcomeCount, ncol = 1)) + + np <- reticulate::import("numpy") + os <- reticulate::import("os") + sys <- reticulate::import("sys") + math <- reticulate::import("math") + scipy <- reticulate::import("scipy") + + sklearn <- reticulate::import("sklearn") + + ntrees <- featureEngineeringSettings$ntrees # 2000 + max_depth <- featureEngineeringSettings$max_depth # 17 + + rf <- sklearn$ensemble$RandomForestClassifier( + max_features = "sqrt", n_estimators = as.integer(ntrees), max_depth = as.integer(max_depth), - min_samples_split = as.integer(2), + min_samples_split = as.integer(2), random_state = as.integer(10), # make this an imput for consistency - n_jobs = as.integer(-1), + n_jobs = as.integer(-1), bootstrap = F ) - - rf = rf$fit(X, y$ravel()) - - inc <- rf$feature_importances_ > 0 - + + rf <- rf$fit(X, y$ravel()) + + inc <- rf$feature_importances_ > 0 + covariateIdsInclude <- covariateMap$covariateId[inc] - } - - trainData$covariateData$covariates <- trainData$covariateData$covariates %>% + } + + trainData$covariateData$covariates <- trainData$covariateData$covariates %>% dplyr::filter(.data$covariateId %in% covariateIdsInclude) - - trainData$covariateData$covariateRef <- trainData$covariateData$covariateRef %>% + + trainData$covariateData$covariateRef <- trainData$covariateData$covariateRef %>% dplyr::filter(.data$covariateId %in% covariateIdsInclude) - - + + featureEngeering <- list( - funct = 'randomForestFeatureSelection', + funct = "randomForestFeatureSelection", settings = list( featureEngineeringSettings = featureEngineeringSettings, covariateIdsInclude = covariateIdsInclude ) ) - - attr(trainData, 'metaData')$featureEngineering = listAppend( - attr(trainData, 'metaData')$featureEngineering, + + attr(trainData, "metaData")$featureEngineering <- listAppend( + attr(trainData, "metaData")$featureEngineering, featureEngeering ) - + return(trainData) - } - - - - diff --git a/R/HelperFunctions.R b/R/HelperFunctions.R index d73a4e9f0..ca74db8f5 100644 --- a/R/HelperFunctions.R +++ b/R/HelperFunctions.R @@ -1,11 +1,3 @@ -# fix issue with nrow - temp fix for me locally -nrow <- function(x){UseMethod("nrow",x)} -#' @exportS3Method NULL -nrow.default <- base::nrow -#' @exportS3Method NULL -nrow.tbl <- function(x){x %>% dplyr::tally() %>% dplyr::pull()} - - removeInvalidString <- function(string){ modString <- gsub('_', ' ', string) modString <- gsub('\\.', ' ', modString) diff --git a/tests/testthat/helper-functions.R b/tests/testthat/helper-functions.R index 7170cf2aa..3ed12a2bf 100644 --- a/tests/testthat/helper-functions.R +++ b/tests/testthat/helper-functions.R @@ -1,9 +1,9 @@ # helper functions for tests -# copies trainData and makes sure andromeda object is copied correctly +# copies trainData and makes sure andromeda object is copied correctly copyTrainData <- function(trainData) { newTrainData <- trainData - + # force andromeda to copy newTrainData$covariateData <- Andromeda::copyAndromeda(trainData$covariateData) @@ -12,21 +12,29 @@ copyTrainData <- function(trainData) { } # create tiny dataset with subset of covariates based on lasso fit -createTinyPlpData <- function(plpData, plpResult, n= 20) { - - covariates <- plpResult$model$covariateImportance %>% - dplyr::slice_max(order_by = abs(.data$covariateValue), - n = n, with_ties = F) %>% +createTinyPlpData <- function(plpData, plpResult, n = 20) { + covariates <- plpResult$model$covariateImportance %>% + dplyr::slice_max( + order_by = abs(.data$covariateValue), + n = n, with_ties = FALSE + ) %>% dplyr::pull(.data$covariateId) tinyPlpData <- plpData tinyPlpData$covariateData <- Andromeda::copyAndromeda(plpData$covariateData) - - tinyPlpData$covariateData$covariates <- plpData$covariateData$covariates %>% - dplyr::filter(covariateId %in% covariates) - tinyPlpData$covariateData$covariateRef <- plpData$covariateData$covariateRef %>% - dplyr::filter(covariateId %in% covariates) - - attributes(tinyPlpData$covariateData)$metaData <- attributes(plpData$covariateData)$metaData + + tinyPlpData$covariateData$covariates <- plpData$covariateData$covariates %>% + dplyr::filter(.data$covariateId %in% covariates) + tinyPlpData$covariateData$covariateRef <- plpData$covariateData$covariateRef %>% + dplyr::filter(.data$covariateId %in% covariates) + + rowIds <- tinyPlpData$covariateData$covariates %>% + dplyr::pull(.data$rowId) %>% + unique() + tinyPlpData$cohorts <- plpData$cohorts %>% + dplyr::filter(.data$rowId %in% rowIds) + + attributes(tinyPlpData$covariateData)$metaData <- + attributes(plpData$covariateData)$metaData class(tinyPlpData$covariateData) <- class(plpData$covariateData) attributes(tinyPlpData)$metaData <- attributes(plpData)$metaData class(tinyPlpData) <- class(plpData) @@ -42,22 +50,24 @@ createData <- function(observations, features, totalFeatures, columnId <- sample(1:totalFeatures, observations * features, replace = TRUE) }) covariateValue <- rep(1, observations * features) - covariates <- data.frame(rowId = rowId, columnId = columnId, covariateValue = covariateValue) + covariates <- data.frame(rowId = rowId, columnId = columnId, covariateValue = covariateValue) if (numCovs) { numRow <- 1:observations numCol <- rep(totalFeatures + 1, observations) withr::with_seed(seed, { numVal <- runif(observations) }) - numCovariates <- data.frame(rowId = as.integer(numRow), - columnId = as.integer(numCol), - covariateValue = numVal) + numCovariates <- data.frame( + rowId = as.integer(numRow), + columnId = as.integer(numCol), + covariateValue = numVal + ) covariates <- rbind(covariates, numCovariates) } withr::with_seed(seed, { - labels <- as.numeric(sample(0:1, observations, replace = TRUE, prob = c(1 - outcomeRate, outcomeRate))) + labels <- as.numeric(sample(0:1, observations, replace = TRUE, prob = c(1 - outcomeRate, outcomeRate))) }) - - data <- list(covariates = covariates, labels = labels) + + data <- list(covariates = covariates, labels = labels) return(data) } diff --git a/tests/testthat/test-featureEngineering.R b/tests/testthat/test-featureEngineering.R index 8ae88f5ce..8f62cd7ea 100644 --- a/tests/testthat/test-featureEngineering.R +++ b/tests/testthat/test-featureEngineering.R @@ -18,138 +18,137 @@ library("testthat") context("FeatureEngineering") -testFEFun <- function(type = 'none'){ - +testFEFun <- function(type = "none") { result <- createFeatureEngineeringSettings(type = type) - + return(result) } - + test_that("createFeatureEngineeringSettings correct class", { - featureEngineeringSettings <- testFEFun() - - expect_is(featureEngineeringSettings, 'featureEngineeringSettings') - - checkFun <- 'sameData' # this is the only option at the moment, edit this when more are added + + expect_is(featureEngineeringSettings, "featureEngineeringSettings") + + checkFun <- "sameData" # this is the only option at the moment, edit this when more are added expect_equal(attr(featureEngineeringSettings, "fun"), checkFun) - }) -testUniFun <- function(k = 100){ - +testUniFun <- function(k = 100) { result <- createUnivariateFeatureSelection(k = k) - + return(result) } test_that("createUnivariateFeatureSelection correct class", { - k <- sample(1000,1) + k <- sample(1000, 1) featureEngineeringSettings <- testUniFun(k = k) - - expect_is(featureEngineeringSettings, 'featureEngineeringSettings') + + expect_is(featureEngineeringSettings, "featureEngineeringSettings") expect_equal(featureEngineeringSettings$k, k) - expect_equal(attr(featureEngineeringSettings, "fun"), 'univariateFeatureSelection') - - expect_error(testUniFun(k = 'ffdff')) + expect_equal(attr(featureEngineeringSettings, "fun"), "univariateFeatureSelection") + + expect_error(testUniFun(k = "ffdff")) expect_error(testUniFun(k = NULL)) expect_error(testUniFun(k = -1)) }) test_that("univariateFeatureSelection", { - - k <- 20+sample(10,1) + k <- 20 + sample(10, 1) featureEngineeringSettings <- testUniFun(k = k) newTrainData <- copyTrainData(trainData) - - trainDataCovariateSize <- newTrainData$covariateData$covariates %>% dplyr::tally() %>% dplyr::pull() - + + trainDataCovariateSize <- newTrainData$covariateData$covariates %>% + dplyr::tally() %>% + dplyr::pull() + reducedTrainData <- univariateFeatureSelection( - trainData = newTrainData, + trainData = newTrainData, featureEngineeringSettings = featureEngineeringSettings, covariateIdsInclude = NULL - ) - - newDataCovariateSize <- reducedTrainData$covariateData$covariates %>% dplyr::tally() %>% dplyr::pull() + ) + + newDataCovariateSize <- reducedTrainData$covariateData$covariates %>% + dplyr::tally() %>% + dplyr::pull() expect_true(newDataCovariateSize <= trainDataCovariateSize) - + # expect k many covariates left - expect_equal(k,reducedTrainData$covariateData$covariateRef %>% dplyr::tally() %>% dplyr::pull()) - + expect_equal(k, reducedTrainData$covariateData$covariateRef %>% dplyr::tally() %>% dplyr::pull()) }) test_that("createRandomForestFeatureSelection correct class", { - ntreesTest <- sample(1000,1) - maxDepthTest <- sample(20,1) + ntreesTest <- sample(1000, 1) + maxDepthTest <- sample(20, 1) featureEngineeringSettings <- createRandomForestFeatureSelection( - ntrees = ntreesTest, + ntrees = ntreesTest, maxDepth = maxDepthTest - ) - - expect_is(featureEngineeringSettings, 'featureEngineeringSettings') + ) + + expect_is(featureEngineeringSettings, "featureEngineeringSettings") expect_equal(featureEngineeringSettings$ntrees, ntreesTest) expect_equal(featureEngineeringSettings$max_depth, maxDepthTest) - expect_equal(attr(featureEngineeringSettings, "fun"), 'randomForestFeatureSelection') - + expect_equal(attr(featureEngineeringSettings, "fun"), "randomForestFeatureSelection") + # error due to params expect_error( createRandomForestFeatureSelection( - ntrees = -1, + ntrees = -1, maxDepth = maxDepthTest ) ) - + expect_error( createRandomForestFeatureSelection( - ntrees = 'dfdfd', + ntrees = "dfdfd", maxDepth = maxDepthTest ) ) - + expect_error( createRandomForestFeatureSelection( - ntrees = 50, - maxDepth = 'maxDepthTest' + ntrees = 50, + maxDepth = "maxDepthTest" ) ) - + expect_error( createRandomForestFeatureSelection( - ntrees = 50, + ntrees = 50, maxDepth = -1 ) ) - }) test_that("randomForestFeatureSelection", { - - ntreesTest <- sample(1000,1) - maxDepthTest <- sample(20,1) + ntreesTest <- sample(1000, 1) + maxDepthTest <- sample(20, 1) featureEngineeringSettings <- createRandomForestFeatureSelection( - ntrees = ntreesTest, + ntrees = ntreesTest, maxDepth = maxDepthTest ) - + newTrainData <- copyTrainData(trainData) - trainDataCovariateSize <- newTrainData$covariateData$covariates %>% dplyr::tally() %>% dplyr::pull() - + trainDataCovariateSize <- newTrainData$covariateData$covariates %>% + dplyr::tally() %>% + dplyr::pull() + reducedTrainData <- randomForestFeatureSelection( - trainData = newTrainData, + trainData = newTrainData, featureEngineeringSettings = featureEngineeringSettings, covariateIdsInclude = NULL ) - - newDataCovariateSize <- reducedTrainData$covariateData$covariates %>% dplyr::tally() %>% dplyr::pull() - expect_true(newDataCovariateSize < trainDataCovariateSize) + newDataCovariateSize <- reducedTrainData$covariateData$covariates %>% + dplyr::tally() %>% + dplyr::pull() + expect_true(newDataCovariateSize < trainDataCovariateSize) }) test_that("featureSelection is applied on test_data", { @@ -157,136 +156,141 @@ test_that("featureSelection is applied on test_data", { featureEngineeringSettings <- testUniFun(k = k) newTrainData <- copyTrainData(trainData) newTrainData <- univariateFeatureSelection( - trainData = newTrainData, + trainData = newTrainData, featureEngineeringSettings = featureEngineeringSettings, covariateIdsInclude = NULL ) - + modelSettings <- setLassoLogisticRegression() - + # added try catch due to model sometimes not fitting plpModel <- tryCatch( - {fitPlp(newTrainData, modelSettings, analysisId='FE')}, - error = function(e){return(NULL)} + { + fitPlp(newTrainData, modelSettings, analysisId = "FE") + }, + error = function(e) { + return(NULL) + } ) - - if(!is.null(plpModel)){ # if the model fit then check this + + if (!is.null(plpModel)) { # if the model fit then check this prediction <- predictPlp(plpModel, testData, population) - expect_true(attr(prediction, 'metaData')$featureEngineering) + expect_true(attr(prediction, "metaData")$featureEngineering) } }) test_that("createSplineSettings correct class", { - featureEngineeringSettings <- createSplineSettings( - continousCovariateId = 12, + continousCovariateId = 12, knots = 4 - ) - - expect_is(featureEngineeringSettings, 'featureEngineeringSettings') + ) + + expect_is(featureEngineeringSettings, "featureEngineeringSettings") expect_equal(featureEngineeringSettings$knots, 4) expect_equal(featureEngineeringSettings$continousCovariateId, 12) - expect_equal(attr(featureEngineeringSettings, "fun"), 'splineCovariates') - - expect_error(createSplineSettings(knots = 'ffdff')) + expect_equal(attr(featureEngineeringSettings, "fun"), "splineCovariates") + + expect_error(createSplineSettings(knots = "ffdff")) expect_error(createSplineSettings(knots = NULL)) }) test_that("createSplineSettings correct class", { - knots <- 4 featureEngineeringSettings <- createSplineSettings( - continousCovariateId = 12101, + continousCovariateId = 12101, knots = knots ) - + trainData <- simulatePlpData(plpDataSimulationProfile, n = 200) - + N <- 50 trainData$covariateData$covariates <- data.frame( rowId = sample(trainData$cohorts$rowId, N), covariateId = rep(12101, N), covariateValue = sample(10, N, replace = T) ) - + trainData$covariateData$analysisRef <- data.frame( analysisId = 101, - analysisName = 'cond', - domainId = 'madeup', + analysisName = "cond", + domainId = "madeup", startDay = 0, endDay = 0, - isBinary = 'N', - missingMeansZero = 'N' + isBinary = "N", + missingMeansZero = "N" ) - + trainData$covariateData$covariateRef <- data.frame( covariateId = 12101, - covariateName = 'test', + covariateName = "test", analysisId = 101, conceptId = 1 ) - -newData <- splineCovariates( - trainData = trainData, + + newData <- splineCovariates( + trainData = trainData, featureEngineeringSettings = featureEngineeringSettings -) + ) -testthat::expect_true(1 < nrow(as.data.frame(newData$covariateData$analysisRef))) -testthat::expect_true((knots+1) == nrow(as.data.frame(newData$covariateData$covariateRef))) -testthat::expect_true((knots+1) == length(table(as.data.frame(newData$covariateData$covariates)$covariateId))) - + testthat::expect_true(1 < nrow(as.data.frame(newData$covariateData$analysisRef))) + testthat::expect_true((knots + 1) == nrow(as.data.frame(newData$covariateData$covariateRef))) + testthat::expect_true((knots + 1) == length(table(as.data.frame(newData$covariateData$covariates)$covariateId))) }) test_that("createStratifiedImputationSettings correct class", { - + ageSplits <- c(33, 38, 42) featureEngineeringSettings <- createStratifiedImputationSettings( - covariateId = 12101, - ageSplits = c(20,50,70) - ) - - trainData <- simulatePlpData(plpDataSimulationProfile, n = 200) - - N <- 50 - trainData$covariateData$covariates <- data.frame( - rowId = sample(trainData$cohorts$rowId, N), - covariateId = rep(12101, N), - covariateValue = sample(10, N, replace = T) + covariateId = 12101, + ageSplits = ageSplits ) - trainData$covariateData$analysisRef <- data.frame( + numSubjects <- nanoData$covariateData$covariates %>% + dplyr::pull(.data$rowId) %>% + dplyr::n_distinct() + Andromeda::appendToTable(nanoData$covariateData$covariates, data.frame( + rowId = sample(nanoData$cohorts$rowId, floor(numSubjects / 2)), + covariateId = rep(12101, floor(numSubjects / 2)), + covariateValue = sample(10, floor(numSubjects / 2), replace = TRUE) + )) + + Andromeda::appendToTable(nanoData$covariateData$analysisRef, data.frame( analysisId = 101, - analysisName = 'cond', - domainId = 'madeup', + analysisName = "cond", + domainId = "madeup", startDay = 0, endDay = 0, - isBinary = 'N', - missingMeansZero = 'N' - ) - - trainData$covariateData$covariateRef <- data.frame( + isBinary = "N", + missingMeansZero = "N" + )) + + Andromeda::appendToTable(nanoData$covariateData$covariateRef, data.frame( covariateId = 12101, - covariateName = 'test', + covariateName = "test", analysisId = 101, conceptId = 1 - ) - + )) + stratifiedMeans <- calculateStratifiedMeans( - trainData = trainData, + trainData = nanoData, featureEngineeringSettings = featureEngineeringSettings ) - - testthat::expect_true(nrow(stratifiedMeans) == 8) - -imputedData <- imputeMissingMeans( - trainData = trainData, + + testthat::expect_true(nrow(stratifiedMeans) == 8) + + imputedData <- imputeMissingMeans( + trainData = nanoData, covariateId = 12101, - ageSplits = c(20,50,70), + ageSplits = ageSplits, stratifiedMeans = stratifiedMeans -) + ) -testthat::expect_true( - nrow(as.data.frame(imputedData$covariateData$covariates)) == 200 -) + testthat::expect_equal( + imputedData$covariateData$covariates %>% + dplyr::filter(.data$covariateId == 12101) %>% + dplyr::pull(.data$rowId) %>% + dplyr::n_distinct(), + numSubjects + ) +}) -}) \ No newline at end of file diff --git a/tests/testthat/test-population.R b/tests/testthat/test-population.R index 6add83fee..e74098187 100644 --- a/tests/testthat/test-population.R +++ b/tests/testthat/test-population.R @@ -497,7 +497,7 @@ test_that("population creation parameters", { testthat::test_that("Providing an existing population and skipping population creation works", { popSize <- 400 - newPopulation <- population[sample.int(nrow.default(population), popSize), ] + newPopulation <- population[sample.int(nrow(population), popSize), ] tinyPlpData$population <- newPopulation @@ -517,9 +517,9 @@ testthat::test_that("Providing an existing population and skipping population cr ) trainPredictions <- plpResults$prediction %>% - dplyr::filter(.data$evaluationType == "Train") %>% nrow.default() + dplyr::filter(.data$evaluationType == "Train") %>% nrow() testPredictions <- plpResults$prediction %>% - dplyr::filter(.data$evaluationType == "Test") %>% nrow.default() + dplyr::filter(.data$evaluationType == "Test") %>% nrow() expect_equal(popSize, trainPredictions + testPredictions) })