From 0a65fc553df59d69bb2adaf17490fef170e1834d Mon Sep 17 00:00:00 2001 From: egillax Date: Mon, 18 Nov 2024 16:23:53 +0100 Subject: [PATCH 1/2] add existing splitter and tests --- R/DataSplitting.R | 35 ++++++++++++++++++++++++++--- tests/testthat/test-dataSplitting.R | 35 +++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 3 deletions(-) diff --git a/R/DataSplitting.R b/R/DataSplitting.R index 0c2374285..97960a7e8 100644 --- a/R/DataSplitting.R +++ b/R/DataSplitting.R @@ -93,6 +93,26 @@ createDefaultSplitSetting <- function(testFraction = 0.25, return(splitSettings) } +#' Create the settings for defining how the plpData are split into +#' test/validation/train sets using an existing split - good to use for +#' reproducing results from a different run +#' @param splitIds (data.frame) A data frame with rowId and index columns of +#' type integer/numeric. Index is -1 for test set, positive integer for train +#' set folds +#' @return An object of class \code{splitSettings} +#' @export +createExistingSplitSettings <- function(splitIds) { + checkIsClass(splitIds, "data.frame") + checkColumnNames(splitIds, c("rowId", "index")) + checkIsClass(splitIds$rowId, c("integer", "numeric")) + checkIsClass(splitIds$index, c("integer", "numeric")) + checkHigherEqual(splitIds$index, -1) + + splitSettings <- list(splitIds = splitIds) + attr(splitSettings, "fun") <- "existingSplitter" + class(splitSettings) <- "splitSettings" + return(splitSettings) +} #' Split the plpData into test/train sets using a splitting settings of class @@ -561,7 +581,16 @@ checkInputsSplit <- function(test, train, nfold, seed) { ParallelLogger::logDebug(paste0("nfold: ", nfold)) checkIsClass(nfold, c("numeric", "integer")) checkHigher(nfold, 1) - - ParallelLogger::logInfo(paste0('seed: ', seed)) - checkIsClass(seed, c('numeric','integer')) + + ParallelLogger::logInfo(paste0("seed: ", seed)) + checkIsClass(seed, c("numeric", "integer")) +} + +existingSplitter <- function(population, 
splitSettings) { + splitIds <- splitSettings$splitIds + # check all row Ids are in population + if (sum(!splitIds$rowId %in% population$rowId) > 0) { + stop("Not all rowIds in splitIds are in the population") + } + return(splitIds) } diff --git a/tests/testthat/test-dataSplitting.R b/tests/testthat/test-dataSplitting.R index b8ce628bb..64d2f5cd9 100644 --- a/tests/testthat/test-dataSplitting.R +++ b/tests/testthat/test-dataSplitting.R @@ -417,5 +417,40 @@ test_that("Data splitting by subject", { # test that no subject is not assigned a fold expect_equal(sum(test$index==0), 0) + # test that no subject is not assigned a fold + expect_equal(sum(test$index == 0), 0) +}) + +test_that("Existing data splitter works", { + # split by age + age <- population$ageYear + # create empty index same lengths as age + index <- rep(0, length(age)) + index[age > 43] <- -1 # test set + index[age <= 35] <- 1 # train fold 1 + index[age > 35 & age <= 43] <- 2 # train fold 2 + splitIds <- data.frame(rowId = population$rowId, index = index) + splitSettings <- createExistingSplitSettings(splitIds) + ageSplit <- splitData( + plpData = plpData, + population = population, + splitSettings = splitSettings + ) + + # test only old people in test + expect_equal( + length(ageSplit$Test$labels$rowId), + sum(age > 43) + ) + # only young people in train + expect_equal( + length(ageSplit$Train$labels$rowId), + sum(age <= 43) + ) + # no overlap + expect_equal( + length(intersect(ageSplit$Test$labels$rowId, ageSplit$Train$labels$rowId)), + 0 + ) }) From 7bddc3871bf4385241a0db892956e72c47ffa655 Mon Sep 17 00:00:00 2001 From: egillax Date: Mon, 18 Nov 2024 21:09:33 +0100 Subject: [PATCH 2/2] docs --- NAMESPACE | 1 + man/createDefaultSplitSetting.Rd | 42 +++++++++++++++++++----------- man/createExistingSplitSettings.Rd | 23 ++++++++++++++++ man/splitData.Rd | 36 ++++++++++++++++--------- 4 files changed, 74 insertions(+), 28 deletions(-) create mode 100644 man/createExistingSplitSettings.Rd diff --git 
a/NAMESPACE b/NAMESPACE index 5ce71cbba..ad98ff62d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -22,6 +22,7 @@ export(createDatabaseSchemaSettings) export(createDefaultExecuteSettings) export(createDefaultSplitSetting) export(createExecuteSettings) +export(createExistingSplitSettings) export(createFeatureEngineeringSettings) export(createGlmModel) export(createLearningCurve) diff --git a/man/createDefaultSplitSetting.Rd b/man/createDefaultSplitSetting.Rd index b9020bd45..e63e74375 100644 --- a/man/createDefaultSplitSetting.Rd +++ b/man/createDefaultSplitSetting.Rd @@ -2,8 +2,9 @@ % Please edit documentation in R/DataSplitting.R \name{createDefaultSplitSetting} \alias{createDefaultSplitSetting} -\title{Create the settings for defining how the plpData are split into test/validation/train sets using -default splitting functions (either random stratified by outcome, time or subject splitting)} +\title{Create the settings for defining how the plpData are split into +test/validation/train sets using default splitting functions +(either random stratified by outcome, time or subject splitting)} \usage{ createDefaultSplitSetting( testFraction = 0.25, @@ -14,28 +15,39 @@ createDefaultSplitSetting( ) } \arguments{ -\item{testFraction}{(numeric) A real number between 0 and 1 indicating the test set fraction of the data} +\item{testFraction}{(numeric) A real number between 0 and 1 +indicating the test set fraction of the data} -\item{trainFraction}{(numeric) A real number between 0 and 1 indicating the train set fraction of the data. -If not set train is equal to 1 - test} +\item{trainFraction}{(numeric) A real number between 0 and 1 indicating the +train set fraction of the data. 
If not set train is equal to 1 - test} -\item{splitSeed}{(numeric) A seed to use when splitting the data for reproducibility (if not set a random number will be generated)} +\item{splitSeed}{(numeric) A seed to use when splitting the data for +reproducibility (if not set a random number will be generated)} -\item{nfold}{(numeric) An integer > 1 specifying the number of folds used in cross validation} +\item{nfold}{(numeric) An integer > 1 specifying the number of +folds used in cross validation} -\item{type}{(character) Choice of: \itemize{ -\item'stratified' Each data point is randomly assigned into the test or a train fold set but this is done stratified such that the outcome rate is consistent in each partition -\item'time' Older data are assigned into the training set and newer data are assigned into the test set -\item'subject' Data are partitioned by subject, if a subject is in the data more than once, all the data points for the subject are assigned either into the test data or into the train data (not both). -}} +\item{type}{(character) Choice of: \itemize{ + \item'stratified' Each data point is +randomly assigned into the test or a train fold set but this is done +stratified such that the outcome rate is consistent in each partition + \item'time' Older data are assigned +into the training set and newer data are assigned into the test set + \item'subject' Data are partitioned by +subject, if a subject is in the data more than once, all the data points for +the subject are assigned either into the test data or into the train data +(not both). 
+ }} } \value{ An object of class \code{splitSettings} } \description{ -Create the settings for defining how the plpData are split into test/validation/train sets using -default splitting functions (either random stratified by outcome, time or subject splitting) +Create the settings for defining how the plpData are split into +test/validation/train sets using default splitting functions +(either random stratified by outcome, time or subject splitting) } \details{ -Returns an object of class \code{splitSettings} that specifies the splitting function that will be called and the settings +Returns an object of class \code{splitSettings} that specifies the +splitting function that will be called and the settings } diff --git a/man/createExistingSplitSettings.Rd b/man/createExistingSplitSettings.Rd new file mode 100644 index 000000000..18ffda495 --- /dev/null +++ b/man/createExistingSplitSettings.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/DataSplitting.R +\name{createExistingSplitSettings} +\alias{createExistingSplitSettings} +\title{Create the settings for defining how the plpData are split into +test/validation/train sets using an existing split - good to use for +reproducing results from a different run} +\usage{ +createExistingSplitSettings(splitIds) +} +\arguments{ +\item{splitIds}{(data.frame) A data frame with rowId and index columns of +type integer/numeric. 
Index is -1 for test set, positive integer for train +set folds} +} +\value{ +An object of class \code{splitSettings} +} +\description{ +Create the settings for defining how the plpData are split into +test/validation/train sets using an existing split - good to use for +reproducing results from a different run +} diff --git a/man/splitData.Rd b/man/splitData.Rd index 4c03f61b8..a4368cf4e 100644 --- a/man/splitData.Rd +++ b/man/splitData.Rd @@ -2,7 +2,8 @@ % Please edit documentation in R/DataSplitting.R \name{splitData} \alias{splitData} -\title{Split the plpData into test/train sets using a splitting settings of class \code{splitSettings}} +\title{Split the plpData into test/train sets using splitting settings of class +\code{splitSettings}} \usage{ splitData( plpData = plpData, @@ -11,29 +12,38 @@ splitData( ) } \arguments{ -\item{plpData}{An object of type \code{plpData} - the patient level prediction -data extracted from the CDM.} +\item{plpData}{An object of type \code{plpData} - the patient level +prediction data extracted from the CDM.} -\item{population}{The population created using \code{createStudyPopulation} that define who will be used to develop the model} +\item{population}{The population created using \code{createStudyPopulation} +that defines who will be used to develop the model} -\item{splitSettings}{An object of type \code{splitSettings} specifying the split - the default can be created using \code{createDefaultSplitSetting}} +\item{splitSettings}{An object of type \code{splitSettings} specifying the +split - the default can be created using \code{createDefaultSplitSetting}} } \value{ An object of class \code{splitSettings} } \description{ -Split the plpData into test/train sets using a splitting settings of class \code{splitSettings} +Split the plpData into test/train sets using splitting settings of class +\code{splitSettings} } \details{ -Returns a list containing the training data (Train) and optionally the test data (Test). 
Train is an Andromeda object containing -\itemize{\item covariates: a table (rowId, covariateId, covariateValue) containing the covariates for each data point in the train data +Returns a list containing the training data (Train) and optionally the test +data (Test). Train is an Andromeda object containing +\itemize{\item covariates: a table (rowId, covariateId, covariateValue) +containing the covariates for each data point in the train data \item covariateRef: a table with the covariate information - \item labels: a table (rowId, outcomeCount, ...) for each data point in the train data (outcomeCount is the class label) - \item folds: a table (rowId, index) specifying which training fold each data point is in. - } + \item labels: a table (rowId, outcomeCount, ...) for each data point +in the train data (outcomeCount is the class label) + \item folds: a table (rowId, index) specifying which training +fold each data point is in. + } Test is an Andromeda object containing -\itemize{\item covariates: a table (rowId, covariateId, covariateValue) containing the covariates for each data point in the test data +\itemize{\item covariates: a table (rowId, covariateId, covariateValue) +containing the covariates for each data point in the test data \item covariateRef: a table with the covariate information - \item labels: a table (rowId, outcomeCount, ...) for each data point in the test data (outcomeCount is the class label) + \item labels: a table (rowId, outcomeCount, ...) for each data +point in the test data (outcomeCount is the class label) } }