From 3f1a292285ef6ac4e5ff6039bf9eca0167a2e3a2 Mon Sep 17 00:00:00 2001
From: egillax <egillax@gmail.com>
Date: Mon, 18 Nov 2024 16:22:21 +0100
Subject: [PATCH] linting and docs tests for datasplitting and fixes in
 paramcheck

---
 R/ParamChecks.R                     |   8 +-
 tests/testthat/test-dataSplitting.R | 367 ++++++++++++++--------------
 2 files changed, 189 insertions(+), 186 deletions(-)

diff --git a/R/ParamChecks.R b/R/ParamChecks.R
index e62ab216b..7b8ef29e1 100644
--- a/R/ParamChecks.R
+++ b/R/ParamChecks.R
@@ -29,7 +29,7 @@ checkBoolean <- function(parameter) {
 
 checkHigherEqual <- function(parameter, value) {
   name <- deparse(substitute(parameter))
-  if (!is.numeric(parameter) || parameter < value) {
+  if (!is.numeric(parameter) || any(parameter < value)) {
     ParallelLogger::logError(paste0(name, " needs to be >= ", value))
     stop(paste0(name, " needs to be >= ", value))
   }
@@ -38,7 +38,7 @@ checkHigherEqual <- function(parameter, value) {
 
 checkLowerEqual <- function(parameter, value) {
   name <- deparse(substitute(parameter))
-  if (!is.numeric(parameter) || parameter > value) {
+  if (!is.numeric(parameter) || any(parameter > value)) {
     ParallelLogger::logError(paste0(name, " needs to be <= ", value))
     stop(paste0(name, " needs to be <= ", value))
   }
@@ -47,7 +47,7 @@ checkLowerEqual <- function(parameter, value) {
 
 checkHigher <- function(parameter, value) {
   name <- deparse(substitute(parameter))
-  if (!is.numeric(parameter) || parameter <= value) {
+  if (!is.numeric(parameter) || any(parameter <= value)) {
     ParallelLogger::logError(paste0(name, " needs to be > ", value))
     stop(paste0(name, " needs to be > ", value))
   }
@@ -56,7 +56,7 @@ checkHigher <- function(parameter, value) {
 
 checkLower <- function(parameter, value) {
   name <- deparse(substitute(parameter))
-  if (!is.numeric(parameter) || parameter >= value) {
+  if (!is.numeric(parameter) || any(parameter >= value)) {
     ParallelLogger::logError(paste0(name, " needs to be < ", value))
     stop(paste0(name, " needs to be < ", value))
   }
diff --git a/tests/testthat/test-dataSplitting.R b/tests/testthat/test-dataSplitting.R
index b8ce628bb..efd60fe21 100644
--- a/tests/testthat/test-dataSplitting.R
+++ b/tests/testthat/test-dataSplitting.R
@@ -2,13 +2,13 @@
 # Copyright 2021 Observational Health Data Sciences and Informatics
 #
 # This file is part of PatientLevelPrediction
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -21,30 +21,29 @@ context("Data splitting")
 # make sure pop is all plpData people
 
 populationT <- plpData$cohorts
-populationT$outcomeCount <- sample(c(0,1), nrow(populationT), replace = T)
+populationT$outcomeCount <- sample(c(0, 1), nrow(populationT), replace = TRUE)
 attr(populationT, "metaData")$outcomeId <- outcomeId
-attr(populationT, "metaData")$populationSettings <- list(madeup = T)
-attr(populationT, "metaData")$restrictPlpDataSettings <- list(madeup = T)
-attr(populationT, "metaData")$attrition <- c(1,2,3)
+attr(populationT, "metaData")$populationSettings <- list(madeup = TRUE)
+attr(populationT, "metaData")$restrictPlpDataSettings <- list(madeup = TRUE)
+attr(populationT, "metaData")$attrition <- c(1, 2, 3)
 
 # check correct inputs
-testFraction1 <- sample(9,1)/10
-trainFraction1 <- 1-testFraction1
-splitSeed1 <- sample(100000,1)
-nfold1 <- 1+sample(10,1)
-type1 <- sample(c('stratified', 'time', 'subject'), 1)
+testFraction1 <- sample(9, 1) / 10
+trainFraction1 <- 1 - testFraction1
+splitSeed1 <- sample(100000, 1)
+nfold1 <- 1 + sample(10, 1)
+type1 <- sample(c("stratified", "time", "subject"), 1)
 
 defaultSetting <- function(
-  testFraction = testFraction1, 
-  trainFraction = trainFraction1, 
-  splitSeed = splitSeed1, 
-  nfold = nfold1,
-  type = type1
-){
+    testFraction = testFraction1,
+    trainFraction = trainFraction1,
+    splitSeed = splitSeed1,
+    nfold = nfold1,
+    type = type1) {
   result <- createDefaultSplitSetting(
-    testFraction = testFraction, 
-    trainFraction = trainFraction, 
-    splitSeed = splitSeed, 
+    testFraction = testFraction,
+    trainFraction = trainFraction,
+    splitSeed = splitSeed,
     nfold = nfold,
     type = type
   )
@@ -53,369 +52,373 @@ defaultSetting <- function(
 
 
 test_that("createDefaultSplitSetting", {
-  
   splitSettings <- defaultSetting()
-  
-  expect_is(splitSettings, 'splitSettings')
-  
-  expectFun <- 'randomSplitter'
-  if(type1 == 'time'){
-    expectFun <- 'timeSplitter'
+
+  expect_is(splitSettings, "splitSettings")
+
+  expectFun <- "randomSplitter"
+  if (type1 == "time") {
+    expectFun <- "timeSplitter"
   }
-  if(type1 == 'subject'){
-    expectFun <- 'subjectSplitter'
+  if (type1 == "subject") {
+    expectFun <- "subjectSplitter"
   }
-  
+
   expect_equal(attr(splitSettings, "fun"), expectFun)
-  
+
   expect_equal(splitSettings$test, testFraction1)
   expect_equal(splitSettings$train, trainFraction1)
   expect_equal(splitSettings$seed, splitSeed1)
   expect_equal(splitSettings$nfold, nfold1)
-  
-  #check input errors for testFraction
+
+  # check input errors for testFraction
   expect_error(
-    defaultSetting(testFraction = 'character')
+    defaultSetting(testFraction = "character")
   )
-  
+
   expect_error(
     defaultSetting(testFraction = -0.1)
   )
-  
+
   expect_error(
     defaultSetting(testFraction = 1.001)
   )
-  
+
   # check input error for trainFraction
   expect_error(
-    defaultSetting(trainFraction = 'trainFraction')
+    defaultSetting(trainFraction = "trainFraction")
   )
-  
+
   expect_error(
     defaultSetting(trainFraction = 1.2)
   )
-  
+
   expect_error(
     defaultSetting(trainFraction = -0.2)
   )
-  
-  #check error for splitSeed
-  
+
+  # check error for splitSeed
+
   expect_error(
-    defaultSetting(splitSeed = NULL)  
+    defaultSetting(splitSeed = NULL)
   )
-  
+
   expect_error(
-    defaultSetting(splitSeed = 'NULL')  
+    defaultSetting(splitSeed = "NULL")
   )
-  
+
   # check error for nfold
   expect_error(
     defaultSetting(nfold = NULL)
   )
   expect_error(
-    defaultSetting(nfold = 'NULL')
+    defaultSetting(nfold = "NULL")
   )
-  
+
   # incorrect type
   expect_error(
-    defaultSetting(type = 'madeup')
+    defaultSetting(type = "madeup")
   )
   expect_error(
-    defaultSetting(type =  NULL)
+    defaultSetting(type = NULL)
   )
   expect_error(
-    defaultSetting(type =  1)
+    defaultSetting(type = 1)
   )
-  
 })
 
 
 
 test_that("Main split function: splitData", {
-  
   # check default settings with test/train
   splitSettings <- defaultSetting()
-  
+
   splitData <- splitData(
     plpData = plpData,
     population = populationT,
     splitSettings = splitSettings
   )
-  
+
   # check class
-  expect_is(splitData, 'splitData')
-  
+  expect_is(splitData, "splitData")
+
   # should have test/train
-  expect_equal(names(splitData), c('Train', 'Test'))
-  
+  expect_equal(names(splitData), c("Train", "Test"))
+
   # train and test are CovariateData
-  expect_is(splitData$Train$covariateData, 'CovariateData')
-  expect_is(splitData$Test$covariateData, 'CovariateData')
-  
+  expect_is(splitData$Train$covariateData, "CovariateData")
+  expect_is(splitData$Test$covariateData, "CovariateData")
+
   # Train has labels/folds/covariateData
-  expect_equal(names(splitData$Train), c('labels', 'folds', 'covariateData'))
-  
+  expect_equal(names(splitData$Train), c("labels", "folds", "covariateData"))
+
   # Test has labels/covariateData
-  expect_equal(names(splitData$Test), c('labels', 'covariateData'))
-  
+  expect_equal(names(splitData$Test), c("labels", "covariateData"))
+
   # check attributes for Train
   expect_equal(attr(splitData$Train, "metaData")$outcomeId, attr(populationT, "metaData")$outcomeId)
   expect_equal(attr(splitData$Train, "metaData")$targetId, plpData$metaData$databaseDetails$targetId)
   expect_equal(
-    attr(splitData$Train, "metaData")$cdmDatabaseSchema, 
+    attr(splitData$Train, "metaData")$cdmDatabaseSchema,
     plpData$metaData$databaseDetails$cdmDatabaseSchema
-    )
-  
-  expect_is(attr(splitData$Train, "metaData")$restrictPlpDataSettings, 'list')
+  )
+
+  expect_is(attr(splitData$Train, "metaData")$restrictPlpDataSettings, "list")
   expect_equal(
-    attr(splitData$Train, "metaData")$covariateSettings, 
+    attr(splitData$Train, "metaData")$covariateSettings,
     plpData$metaData$covariateSettings
   )
   expect_equal(
-    attr(splitData$Train, "metaData")$populationSettings, 
+    attr(splitData$Train, "metaData")$populationSettings,
     attr(populationT, "metaData")$populationSettings
   )
   expect_equal(
-    attr(splitData$Train, "metaData")$attrition, 
+    attr(splitData$Train, "metaData")$attrition,
     attr(populationT, "metaData")$attrition
   )
-  
+
   expect_equal(
-    attr(splitData$Train, "metaData")$splitSettings, 
+    attr(splitData$Train, "metaData")$splitSettings,
     splitSettings
   )
-  
+
   # train+test should be full data as train+test = 1
   expect_equal(
-    nrow(splitData$Train$labels) + nrow(splitData$Test$labels), 
+    nrow(splitData$Train$labels) + nrow(splitData$Test$labels),
     nrow(populationT)
   )
   expect_equal(
-    splitData$Train$covariateData$covariates %>% dplyr::tally() %>% dplyr::pull() + 
-      splitData$Test$covariateData$covariates %>% dplyr::tally() %>% dplyr::pull(), 
+    splitData$Train$covariateData$covariates %>% dplyr::tally() %>% dplyr::pull() +
+      splitData$Test$covariateData$covariates %>% dplyr::tally() %>% dplyr::pull(),
     plpData$covariateData$covariates %>% dplyr::tally() %>% dplyr::pull()
   )
-  
+
   # make sure nfolds is correct
   expect_equal(
     min(splitData$Train$folds$index),
     1
   )
-  
+
   expect_equal(
     max(splitData$Train$folds$index),
     splitSettings$nfold
   )
-  
-  
+
+
   # check when test is 0
   splitSettings <- defaultSetting(
-    testFraction = 0, 
+    testFraction = 0,
     trainFraction = 1
-    )
-  
+  )
+
   splitData <- splitData(
     plpData = plpData,
     population = populationT,
     splitSettings = splitSettings
   )
-  
+
   # should just have train
-  expect_equal(names(splitData), c('Train'))
-  
+  expect_equal(names(splitData), c("Train"))
+
   # train labels should be the same size at the population
   expect_equal(
-    nrow(splitData$Train$labels), 
+    nrow(splitData$Train$labels),
     nrow(populationT)
   )
   expect_equal(
-    splitData$Train$covariateData$covariates %>% dplyr::tally() %>% dplyr::pull(), 
+    splitData$Train$covariateData$covariates %>% dplyr::tally() %>% dplyr::pull(),
     plpData$covariateData$covariates %>% dplyr::tally() %>% dplyr::pull()
   )
-  
 })
 
 test_that("dataSummary works", {
-  
   splitSettings <- defaultSetting(
-    testFraction = 0, 
+    testFraction = 0,
     trainFraction = 1
   )
-  
+
   splitData <- splitData(
     plpData = plpData,
     population = populationT,
     splitSettings = splitSettings
   )
 
-summaryPrint <- dataSummary(splitData)
-expect_equal(summaryPrint, TRUE)
-  
+  summaryPrint <- dataSummary(splitData)
+  expect_equal(summaryPrint, TRUE)
 })
 
 
 test_that("Data stratified splitting", {
-  
   splitSettings <- defaultSetting(
-    test=0.3, 
-    nfold=3
-    )
+    test = 0.3,
+    nfold = 3
+  )
 
   # error due to insufficient outcomes
-  DSpopulation1 <- data.frame(rowId=1:20, outcomeCount=c(1,1,1,1,rep(0,16))) 
-  expect_error(randomSplitter(population = DSpopulation1, splitSettings = splitSettings))
-  
-  DSpopulation2 <- data.frame(rowId=1:200, outcomeCount=c(rep(1,42),rep(0,158)))
+  dsPopulation1 <- data.frame(rowId = 1:20, outcomeCount = c(1, 1, 1, 1, rep(0, 16)))
+  expect_error(randomSplitter(population = dsPopulation1, splitSettings = splitSettings))
+
+  dsPopulation2 <- data.frame(rowId = 1:200, outcomeCount = c(rep(1, 42), rep(0, 158)))
   splitSettings <- defaultSetting(
     train = 0.7,
-    test = 0.3, 
+    test = 0.3,
     nfold = 4
   )
   # fold creation check 1 (fixed)
-  test <- randomSplitter(population = DSpopulation2, splitSettings = splitSettings)
-  test <- merge(DSpopulation2, test)
+  test <- randomSplitter(population = dsPopulation2, splitSettings = splitSettings)
+  test <- merge(DsPopulation2, test)
   test <- table(test$outcomeCount, test$index)
-  test.returned <- paste(test, collapse='-')
-  test.expected <- paste(matrix(c(47,28,28,28,27,12,8,8,7,7), ncol=5, byrow=T),collapse='-')
-  expect_identical(test.returned, test.expected)
-  
+  testReturned <- paste(test, collapse = "-")
+  testExpected <- paste(matrix(c(47, 28, 28, 28, 27, 12, 8, 8, 7, 7), ncol = 5, byrow = TRUE), collapse = "-")
+  expect_identical(testReturned, testExpected)
+
   # fold creation check 2 (sum)
   size <- 500
-  DSpopulation3 <- data.frame(rowId=1:size, outcomeCount=c(rep(1,floor(size/3)),rep(0,size-floor(size/3)))) 
+  dsPopulation3 <- data.frame(rowId = 1:size, outcomeCount = c(rep(1, floor(size / 3)), rep(0, size - floor(size / 3))))
   splitSettings <- defaultSetting(
     train = 0.8,
-    test = 0.2, 
+    test = 0.2,
     nfold = 4
   )
-  test <- randomSplitter(population = DSpopulation3, splitSettings = splitSettings) 
+  test <- randomSplitter(population = dsPopulation3, splitSettings = splitSettings)
   test <- merge(DSpopulation3, test)
   test <- table(test$outcomeCount, test$index)
   expect_that(sum(test), equals(size))
-  
+
   # test the training fraction parameter for learning curves
-  size = 500
-  DSpopulation4 <- data.frame(rowId=1:size,
-                            outcomeCount=c(rep(1,floor(size/3)),
-                                           rep(0,size-floor(size/3))))
+  size <- 500
+  dsPopulation4 <- data.frame(
+    rowId = 1:size,
+    outcomeCount = c(
+      rep(1, floor(size / 3)),
+      rep(0, size - floor(size / 3))
+    )
+  )
   splitSettings <- defaultSetting(
     train = 0.4,
-    test = 0.2, 
+    test = 0.2,
     nfold = 4
   )
-  test <- randomSplitter(population = DSpopulation4, splitSettings = splitSettings)
+  test <- randomSplitter(population = dsPopulation4, splitSettings = splitSettings)
 
-  tolerance = 5
-  excludedPatients = 200
+  tolerance <- 5
+  excludedPatients <- 200
   # test, if the number of patients in each fold are roughly the same
   expect_equal(length(test$index[test$index == 1]),
-               length(test$index[test$index == 3]),
-               tolerance = tolerance)
+    length(test$index[test$index == 3]),
+    tolerance = tolerance
+  )
   expect_equal(length(test$index[test$index == 2]),
-               length(test$index[test$index == 4]),
-               tolerance = tolerance)
+    length(test$index[test$index == 4]),
+    tolerance = tolerance
+  )
   expect_equal(length(test$index[test$index == 1]),
-               length(test$index[test$index == 4]),
-               tolerance = tolerance)
+    length(test$index[test$index == 4]),
+    tolerance = tolerance
+  )
   # test, if patients were excluded according to the training fraction
-  expect_equal(length(test$index[test$index == 0]),
-               excludedPatients)
+  expect_equal(
+    length(test$index[test$index == 0]),
+    excludedPatients
+  )
 })
 
 test_that("Data splitting by time", {
-  
   # fold creation check (sum)
   size <- 500
   set.seed(1)
-  DSpopulation2 <- data.frame(
-    rowId=1:size, 
-    outcomeCount=sample(0:1,size,replace=TRUE),
+  dsPopulation2 <- data.frame(
+    rowId = 1:size,
+    outcomeCount = sample(0:1, size, replace = TRUE),
     cohortStartDate = as.Date("2010-01-01") + c(1:size)
-    )
+  )
   splitSettings <- defaultSetting(
     train = 0.8,
-    test = 0.2, 
+    test = 0.2,
     nfold = 4
   )
-  
-  test <- timeSplitter(population = DSpopulation2, splitSettings = splitSettings) 
-  test <- merge(DSpopulation2, test)
+
+  test <- timeSplitter(population = dsPopulation2, splitSettings = splitSettings)
+  test <- merge(dsPopulation2, test)
   test <- table(test$outcomeCount, test$index)
   expect_that(sum(test), equals(size))
-  
+
   # test the training fraction parameter for learning curves
   size <- 500
   set.seed(1)
-  DSpopulation3 <- data.frame(rowId=1:size,
-                            outcomeCount=sample(0:1,size,replace=TRUE),
-                            cohortStartDate = as.Date("2010-01-01") + c(1:size))
+  dsPopulation3 <- data.frame(
+    rowId = 1:size,
+    outcomeCount = sample(0:1, size, replace = TRUE),
+    cohortStartDate = as.Date("2010-01-01") + c(1:size)
+  )
   splitSettings <- defaultSetting(
     train = 0.4,
-    test = 0.2, 
+    test = 0.2,
     nfold = 4
   )
-  test <- timeSplitter(population = DSpopulation3, splitSettings = splitSettings)
-  
-  tolerance = 5
-  excludedPatients = 196
+  test <- timeSplitter(population = dsPopulation3, splitSettings = splitSettings)
+
+  tolerance <- 5
+  excludedPatients <- 196
   # test, if the number of patients in each fold are roughly the same
   expect_equal(length(test$index[test$index == 1]),
-               length(test$index[test$index == 3]),
-               tolerance = tolerance)
+    length(test$index[test$index == 3]),
+    tolerance = tolerance
+  )
   expect_equal(length(test$index[test$index == 2]),
-               length(test$index[test$index == 4]),
-               tolerance = tolerance)
+    length(test$index[test$index == 4]),
+    tolerance = tolerance
+  )
   expect_equal(length(test$index[test$index == 1]),
-               length(test$index[test$index == 4]),
-               tolerance = tolerance)
+    length(test$index[test$index == 4]),
+    tolerance = tolerance
+  )
   # test, if patients were excluded according to the training fraction
-  expect_equal(length(test$index[test$index == 0]),
-               excludedPatients)
-  
+  expect_equal(
+    length(test$index[test$index == 0]),
+    excludedPatients
+  )
 })
 
 
 
 test_that("Data splitting by subject", {
-# error message checks
-  DSpopulation1 <- data.frame(rowId=1:20, subjectId = 1:20, outcomeCount=c(1,1,1,1,rep(0,16))) 
+  # error message checks
+  dsPopulation1 <- data.frame(rowId = 1:20, subjectId = 1:20, outcomeCount = c(1, 1, 1, 1, rep(0, 16)))
   splitSettings <- defaultSetting(
     train = 0.7,
-    test = 0.3, 
+    test = 0.3,
     nfold = 3
   )
-  expect_error(subjectSplitter(population = DSpopulation1, splitSettings = splitSettings ))
+  expect_error(subjectSplitter(population = DSpopulation1, splitSettings = splitSettings))
 
-  DSpopulation2 <- data.frame(rowId=1:200,subjectId = 1:200, outcomeCount=c(rep(1,42),rep(0,158)))
+  dsPopulation2 <- data.frame(rowId = 1:200, subjectId = 1:200, outcomeCount = c(rep(1, 42), rep(0, 158)))
   splitSettings <- defaultSetting(
     train = 0.8,
-    test = 0.2, 
+    test = 0.2,
     nfold = 4
   )
   test <- subjectSplitter(population = DSpopulation2, splitSettings = splitSettings)
   test <- merge(DSpopulation2, test)
   test <- table(test$outcomeCount, test$index)
-  test.returned <- paste(test, collapse='-')
-  test.expected <- paste(matrix(c(32,32,32,31,31,8,9,9,8,8), ncol=5, byrow=T),collapse='-')
-  expect_identical(test.returned, test.expected)
+  testReturned <- paste(test, collapse = "-")
+  testExpected <- paste(matrix(c(32, 32, 32, 31, 31, 8, 9, 9, 8, 8), ncol = 5, byrow = TRUE), collapse = "-")
+  expect_identical(testReturned, testExpected)
 
-# test that people are not in multiple folds
-  DSpopulation3 <- data.frame(rowId=1:200,subjectId = rep(1:50,4), outcomeCount=c(rep(1,42),rep(0,158)))
+  # test that people are not in multiple folds
+  dsPopulation3 <- data.frame(rowId = 1:200, subjectId = rep(1:50, 4), outcomeCount = c(rep(1, 42), rep(0, 158)))
   splitSettings <- defaultSetting(
     train = 0.75,
-    test = 0.25, 
+    test = 0.25,
     nfold = 3
   )
-  test <- subjectSplitter(population = DSpopulation3, splitSettings  = splitSettings )
+  test <- subjectSplitter(population = DSpopulation3, splitSettings = splitSettings)
   test <- merge(DSpopulation3, test)
 
-  expect_equal(unique(table(test$subjectId[test$index==-1])), 4)
-  expect_equal(unique(table(test$subjectId[test$index==2])), 4)
-  expect_equal(unique(table(test$subjectId[test$index==3])), 4)
-  expect_equal(unique(table(test$subjectId[test$index==1])), 4)
-  
-# test that no subject is not assigned a fold
-  expect_equal(sum(test$index==0), 0)
+  expect_equal(unique(table(test$subjectId[test$index == -1])), 4)
+  expect_equal(unique(table(test$subjectId[test$index == 2])), 4)
+  expect_equal(unique(table(test$subjectId[test$index == 3])), 4)
+  expect_equal(unique(table(test$subjectId[test$index == 1])), 4)
 
 
 })