From 0a65fc553df59d69bb2adaf17490fef170e1834d Mon Sep 17 00:00:00 2001
From: egillax <egillax@gmail.com>
Date: Mon, 18 Nov 2024 16:23:53 +0100
Subject: [PATCH 1/2] add existing splitter and tests

---
 R/DataSplitting.R                   | 35 ++++++++++++++++++++++++++---
 tests/testthat/test-dataSplitting.R | 35 +++++++++++++++++++++++++++++
 2 files changed, 67 insertions(+), 3 deletions(-)

diff --git a/R/DataSplitting.R b/R/DataSplitting.R
index 0c2374285..97960a7e8 100644
--- a/R/DataSplitting.R
+++ b/R/DataSplitting.R
@@ -93,6 +93,26 @@ createDefaultSplitSetting <- function(testFraction = 0.25,
   return(splitSettings)
 }
 
+#' Create the settings for defining how the plpData are split into
+#' test/validation/train sets using an existing split - good to use for 
+#' reproducing results from a different run 
+#' @param splitIds (data.frame) A data frame with rowId and index columns of 
+#' type integer/numeric. Index is -1 for test set, positive integer for train 
+#' set folds
+#' @return An object of class \code{splitSettings}
+#' @export
+createExistingSplitSettings <- function(splitIds) {
+  checkIsClass(splitIds, "data.frame")
+  checkColumnNames(splitIds, c("rowId", "index"))
+  checkIsClass(splitIds$rowId, c("integer", "numeric"))
+  checkIsClass(splitIds$index, c("integer", "numeric"))
+  checkHigherEqual(splitIds$index, -1)
+
+  splitSettings <- list(splitIds = splitIds)
+  attr(splitSettings, "fun") <- "existingSplitter"
+  class(splitSettings) <- "splitSettings"
+  return(splitSettings)
+}
 
 
 #' Split the plpData into test/train sets using a splitting settings of class 
@@ -561,7 +581,16 @@ checkInputsSplit <- function(test, train, nfold, seed) {
   ParallelLogger::logDebug(paste0("nfold: ", nfold))
   checkIsClass(nfold, c("numeric", "integer"))
   checkHigher(nfold, 1)
-  
-  ParallelLogger::logInfo(paste0('seed: ', seed))
-  checkIsClass(seed, c('numeric','integer'))
+
+  ParallelLogger::logInfo(paste0("seed: ", seed))
+  checkIsClass(seed, c("numeric", "integer"))
+}
+
+existingSplitter <- function(population, splitSettings) {
+  splitIds <- splitSettings$splitIds
+  # check that every rowId in splitIds exists in the population
+  if (sum(!splitIds$rowId %in% population$rowId) > 0) {
+    stop("Not all rowIds in splitIds are in the population")
+  }
+  return(splitIds)
 }
diff --git a/tests/testthat/test-dataSplitting.R b/tests/testthat/test-dataSplitting.R
index b8ce628bb..64d2f5cd9 100644
--- a/tests/testthat/test-dataSplitting.R
+++ b/tests/testthat/test-dataSplitting.R
@@ -417,5 +417,40 @@ test_that("Data splitting by subject", {
 # test that no subject is not assigned a fold
   expect_equal(sum(test$index==0), 0)
 
+  # test that every subject is assigned a fold (no index equal to 0)
+  expect_equal(sum(test$index == 0), 0)
+})
+
+test_that("Existing data splitter works", {
+  # split by age
+  age <- population$ageYear
+  # create a zero-filled index with the same length as age
+  index <- rep(0, length(age))
+  index[age > 43] <- -1 # test set
+  index[age <= 35] <- 1 # train fold 1
+  index[age > 35 & age <= 43] <- 2 # train fold 2
+  splitIds <- data.frame(rowId = population$rowId, index = index)
+  splitSettings <- createExistingSplitSettings(splitIds)
+  ageSplit <- splitData(
+    plpData = plpData,
+    population = population,
+    splitSettings = splitSettings
+  )
+  
+  # test set has as many rows as there are people older than 43
+  expect_equal(
+    length(ageSplit$Test$labels$rowId),
+    sum(age > 43)
+  )
+  # train set has as many rows as there are people aged 43 or younger
+  expect_equal(
+    length(ageSplit$Train$labels$rowId),
+    sum(age <= 43)
+  )
+  # no overlap
+  expect_equal(
+    length(intersect(ageSplit$Test$labels$rowId, ageSplit$Train$labels$rowId)),
+    0
+  )
 
 })

From 7bddc3871bf4385241a0db892956e72c47ffa655 Mon Sep 17 00:00:00 2001
From: egillax <egillax@gmail.com>
Date: Mon, 18 Nov 2024 21:09:33 +0100
Subject: [PATCH 2/2] docs

---
 NAMESPACE                          |  1 +
 man/createDefaultSplitSetting.Rd   | 42 +++++++++++++++++++-----------
 man/createExistingSplitSettings.Rd | 23 ++++++++++++++++
 man/splitData.Rd                   | 36 ++++++++++++++++---------
 4 files changed, 74 insertions(+), 28 deletions(-)
 create mode 100644 man/createExistingSplitSettings.Rd

diff --git a/NAMESPACE b/NAMESPACE
index 5ce71cbba..ad98ff62d 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -22,6 +22,7 @@ export(createDatabaseSchemaSettings)
 export(createDefaultExecuteSettings)
 export(createDefaultSplitSetting)
 export(createExecuteSettings)
+export(createExistingSplitSettings)
 export(createFeatureEngineeringSettings)
 export(createGlmModel)
 export(createLearningCurve)
diff --git a/man/createDefaultSplitSetting.Rd b/man/createDefaultSplitSetting.Rd
index b9020bd45..e63e74375 100644
--- a/man/createDefaultSplitSetting.Rd
+++ b/man/createDefaultSplitSetting.Rd
@@ -2,8 +2,9 @@
 % Please edit documentation in R/DataSplitting.R
 \name{createDefaultSplitSetting}
 \alias{createDefaultSplitSetting}
-\title{Create the settings for defining how the plpData are split into test/validation/train sets using 
-default splitting functions (either random stratified by outcome, time or subject splitting)}
+\title{Create the settings for defining how the plpData are split into
+test/validation/train sets using default splitting functions 
+(either random stratified by outcome, time or subject splitting)}
 \usage{
 createDefaultSplitSetting(
   testFraction = 0.25,
@@ -14,28 +15,39 @@ createDefaultSplitSetting(
 )
 }
 \arguments{
-\item{testFraction}{(numeric) A real number between 0 and 1 indicating the test set fraction of the data}
+\item{testFraction}{(numeric) A real number between 0 and 1
+indicating the test set fraction of the data}
 
-\item{trainFraction}{(numeric) A real number between 0 and 1 indicating the train set fraction of the data.
-If not set train is equal to 1 - test}
+\item{trainFraction}{(numeric) A real number between 0 and 1 indicating the 
+train set fraction of the data. If not set train is equal to 1 - test}
 
-\item{splitSeed}{(numeric) A seed to use when splitting the data for reproducibility (if not set a random number will be generated)}
+\item{splitSeed}{(numeric) A seed to use when splitting the data for 
+reproducibility (if not set a random number will be generated)}
 
-\item{nfold}{(numeric) An integer > 1 specifying the number of folds used in cross validation}
+\item{nfold}{(numeric) An integer > 1 specifying the number of
+folds used in cross validation}
 
-\item{type}{(character) Choice of:  \itemize{
-\item'stratified' Each data point is randomly assigned into the test or a train fold set but this is done stratified such that the outcome rate is consistent in each partition 
-\item'time' Older data are assigned into the training set and newer data are assigned into the test set
-\item'subject' Data are partitioned by subject, if a subject is in the data more than once, all the data points for the subject are assigned either into the test data or into the train data (not both).
-}}
+\item{type}{(character) Choice of: \itemize{
+                                     \item'stratified' Each data point is 
+randomly assigned into the test or a train fold set but this is done 
+stratified such that the outcome rate is consistent in each partition
+                                     \item'time' Older data are assigned 
+into the training set and newer data are assigned into the test set
+                                     \item'subject' Data are partitioned by
+subject, if a subject is in the data more than once, all the data points for 
+the subject are assigned either into the test data or into the train data 
+(not both).
+                                        }}
 }
 \value{
 An object of class \code{splitSettings}
 }
 \description{
-Create the settings for defining how the plpData are split into test/validation/train sets using 
-default splitting functions (either random stratified by outcome, time or subject splitting)
+Create the settings for defining how the plpData are split into
+test/validation/train sets using default splitting functions 
+(either random stratified by outcome, time or subject splitting)
 }
 \details{
-Returns an object of class \code{splitSettings} that specifies the splitting function that will be called and the settings
+Returns an object of class \code{splitSettings} that specifies the 
+splitting function that will be called and the settings
 }
diff --git a/man/createExistingSplitSettings.Rd b/man/createExistingSplitSettings.Rd
new file mode 100644
index 000000000..18ffda495
--- /dev/null
+++ b/man/createExistingSplitSettings.Rd
@@ -0,0 +1,23 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/DataSplitting.R
+\name{createExistingSplitSettings}
+\alias{createExistingSplitSettings}
+\title{Create the settings for defining how the plpData are split into
+test/validation/train sets using an existing split - good to use for 
+reproducing results from a different run}
+\usage{
+createExistingSplitSettings(splitIds)
+}
+\arguments{
+\item{splitIds}{(data.frame) A data frame with rowId and index columns of 
+type integer/numeric. Index is -1 for test set, positive integer for train 
+set folds}
+}
+\value{
+An object of class \code{splitSettings}
+}
+\description{
+Create the settings for defining how the plpData are split into
+test/validation/train sets using an existing split - good to use for 
+reproducing results from a different run
+}
diff --git a/man/splitData.Rd b/man/splitData.Rd
index 4c03f61b8..a4368cf4e 100644
--- a/man/splitData.Rd
+++ b/man/splitData.Rd
@@ -2,7 +2,8 @@
 % Please edit documentation in R/DataSplitting.R
 \name{splitData}
 \alias{splitData}
-\title{Split the plpData into test/train sets using a splitting settings of class \code{splitSettings}}
+\title{Split the plpData into test/train sets using a splitting settings of class 
+\code{splitSettings}}
 \usage{
 splitData(
   plpData = plpData,
@@ -11,29 +12,38 @@ splitData(
 )
 }
 \arguments{
-\item{plpData}{An object of type \code{plpData} - the patient level prediction
-data extracted from the CDM.}
+\item{plpData}{An object of type \code{plpData} - the patient level 
+prediction data extracted from the CDM.}
 
-\item{population}{The population created using \code{createStudyPopulation} that define who will be used to develop the model}
+\item{population}{The population created using \code{createStudyPopulation} 
+that define who will be used to develop the model}
 
-\item{splitSettings}{An object of type \code{splitSettings} specifying the split - the default can be created using \code{createDefaultSplitSetting}}
+\item{splitSettings}{An object of type \code{splitSettings} specifying the 
+split - the default can be created using \code{createDefaultSplitSetting}}
 }
 \value{
 An object of class \code{splitSettings}
 }
 \description{
-Split the plpData into test/train sets using a splitting settings of class \code{splitSettings}
+Split the plpData into test/train sets using a splitting settings of class 
+\code{splitSettings}
 }
 \details{
-Returns a list containing the training data (Train) and optionally the test data (Test).  Train is an Andromeda object containing
-\itemize{\item covariates: a table (rowId, covariateId, covariateValue) containing the covariates for each data point in the train data
+Returns a list containing the training data (Train) and optionally the test
+data (Test). Train is an Andromeda object containing
+\itemize{\item covariates: a table (rowId, covariateId, covariateValue)
+containing the covariates for each data point in the train data
          \item covariateRef: a table with the covariate information
-         \item labels: a table (rowId, outcomeCount, ...) for each data point in the train data (outcomeCount is the class label) 
-         \item folds: a table (rowId, index) specifying which training fold each data point is in.
-         } 
+         \item labels: a table (rowId, outcomeCount, ...) for each data point
+in the train data (outcomeCount is the class label)
+         \item folds: a table (rowId, index) specifying which training 
+fold each data point is in.
+         }
 Test is an Andromeda object containing
-\itemize{\item covariates: a table (rowId, covariateId, covariateValue) containing the covariates for each data point in the test data 
+\itemize{\item covariates: a table (rowId, covariateId, covariateValue)
+containing the covariates for each data point in the test data
          \item covariateRef: a table with the covariate information
-         \item labels: a table (rowId, outcomeCount, ...) for each data point in the test data (outcomeCount is the class label) 
+         \item labels: a table (rowId, outcomeCount, ...) for each data
+point in the test data (outcomeCount is the class label)
          }
 }