From 7bddc3871bf4385241a0db892956e72c47ffa655 Mon Sep 17 00:00:00 2001 From: egillax Date: Mon, 18 Nov 2024 21:09:33 +0100 Subject: [PATCH] docs --- NAMESPACE | 1 + man/createDefaultSplitSetting.Rd | 42 +++++++++++++++++++----------- man/createExistingSplitSettings.Rd | 23 ++++++++++++++++ man/splitData.Rd | 36 ++++++++++++++++--------- 4 files changed, 74 insertions(+), 28 deletions(-) create mode 100644 man/createExistingSplitSettings.Rd diff --git a/NAMESPACE b/NAMESPACE index 5ce71cbba..ad98ff62d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -22,6 +22,7 @@ export(createDatabaseSchemaSettings) export(createDefaultExecuteSettings) export(createDefaultSplitSetting) export(createExecuteSettings) +export(createExistingSplitSettings) export(createFeatureEngineeringSettings) export(createGlmModel) export(createLearningCurve) diff --git a/man/createDefaultSplitSetting.Rd b/man/createDefaultSplitSetting.Rd index b9020bd45..e63e74375 100644 --- a/man/createDefaultSplitSetting.Rd +++ b/man/createDefaultSplitSetting.Rd @@ -2,8 +2,9 @@ % Please edit documentation in R/DataSplitting.R \name{createDefaultSplitSetting} \alias{createDefaultSplitSetting} -\title{Create the settings for defining how the plpData are split into test/validation/train sets using -default splitting functions (either random stratified by outcome, time or subject splitting)} +\title{Create the settings for defining how the plpData are split into +test/validation/train sets using default splitting functions +(either random stratified by outcome, time or subject splitting)} \usage{ createDefaultSplitSetting( testFraction = 0.25, @@ -14,28 +15,39 @@ createDefaultSplitSetting( ) } \arguments{ -\item{testFraction}{(numeric) A real number between 0 and 1 indicating the test set fraction of the data} +\item{testFraction}{(numeric) A real number between 0 and 1 +indicating the test set fraction of the data} -\item{trainFraction}{(numeric) A real number between 0 and 1 indicating the train set fraction of the data. 
-If not set train is equal to 1 - test} +\item{trainFraction}{(numeric) A real number between 0 and 1 indicating the +train set fraction of the data. If not set, train is equal to 1 - test} -\item{splitSeed}{(numeric) A seed to use when splitting the data for reproducibility (if not set a random number will be generated)} +\item{splitSeed}{(numeric) A seed to use when splitting the data for +reproducibility (if not set, a random number will be generated)} -\item{nfold}{(numeric) An integer > 1 specifying the number of folds used in cross validation} +\item{nfold}{(numeric) An integer > 1 specifying the number of +folds used in cross validation} -\item{type}{(character) Choice of: \itemize{ -\item'stratified' Each data point is randomly assigned into the test or a train fold set but this is done stratified such that the outcome rate is consistent in each partition -\item'time' Older data are assigned into the training set and newer data are assigned into the test set -\item'subject' Data are partitioned by subject, if a subject is in the data more than once, all the data points for the subject are assigned either into the test data or into the train data (not both). -}} +\item{type}{(character) Choice of: \itemize{ + \item 'stratified' Each data point is +randomly assigned into the test or a train fold set but this is done +stratified such that the outcome rate is consistent in each partition + \item 'time' Older data are assigned +into the training set and newer data are assigned into the test set + \item 'subject' Data are partitioned by +subject; if a subject is in the data more than once, all the data points for +the subject are assigned either into the test data or into the train data +(not both). 
+ }} } \value{ An object of class \code{splitSettings} } \description{ -Create the settings for defining how the plpData are split into test/validation/train sets using -default splitting functions (either random stratified by outcome, time or subject splitting) +Create the settings for defining how the plpData are split into +test/validation/train sets using default splitting functions +(either random stratified by outcome, time or subject splitting) } \details{ -Returns an object of class \code{splitSettings} that specifies the splitting function that will be called and the settings +Returns an object of class \code{splitSettings} that specifies the +splitting function that will be called and the settings } diff --git a/man/createExistingSplitSettings.Rd b/man/createExistingSplitSettings.Rd new file mode 100644 index 000000000..18ffda495 --- /dev/null +++ b/man/createExistingSplitSettings.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/DataSplitting.R +\name{createExistingSplitSettings} +\alias{createExistingSplitSettings} +\title{Create the settings for defining how the plpData are split into +test/validation/train sets using an existing split - good to use for +reproducing results from a different run} +\usage{ +createExistingSplitSettings(splitIds) +} +\arguments{ +\item{splitIds}{(data.frame) A data frame with rowId and index columns of +type integer/numeric. 
Index is -1 for test set, positive integer for train +set folds} +} +\value{ +An object of class \code{splitSettings} +} +\description{ +Create the settings for defining how the plpData are split into +test/validation/train sets using an existing split - good to use for +reproducing results from a different run +} diff --git a/man/splitData.Rd b/man/splitData.Rd index 4c03f61b8..a4368cf4e 100644 --- a/man/splitData.Rd +++ b/man/splitData.Rd @@ -2,7 +2,8 @@ % Please edit documentation in R/DataSplitting.R \name{splitData} \alias{splitData} -\title{Split the plpData into test/train sets using a splitting settings of class \code{splitSettings}} +\title{Split the plpData into test/train sets using splitting settings of class +\code{splitSettings}} \usage{ splitData( plpData = plpData, @@ -11,29 +12,38 @@ splitData( ) } \arguments{ -\item{plpData}{An object of type \code{plpData} - the patient level prediction -data extracted from the CDM.} +\item{plpData}{An object of type \code{plpData} - the patient level +prediction data extracted from the CDM.} -\item{population}{The population created using \code{createStudyPopulation} that define who will be used to develop the model} +\item{population}{The population created using \code{createStudyPopulation} +that defines who will be used to develop the model} -\item{splitSettings}{An object of type \code{splitSettings} specifying the split - the default can be created using \code{createDefaultSplitSetting}} +\item{splitSettings}{An object of type \code{splitSettings} specifying the +split - the default can be created using \code{createDefaultSplitSetting}} } \value{ An object of class \code{splitSettings} } \description{ -Split the plpData into test/train sets using a splitting settings of class \code{splitSettings} +Split the plpData into test/train sets using splitting settings of class +\code{splitSettings} } \details{ -Returns a list containing the training data (Train) and optionally the test data (Test). 
Train is an Andromeda object containing -\itemize{\item covariates: a table (rowId, covariateId, covariateValue) containing the covariates for each data point in the train data +Returns a list containing the training data (Train) and optionally the test +data (Test). Train is an Andromeda object containing +\itemize{\item covariates: a table (rowId, covariateId, covariateValue) +containing the covariates for each data point in the train data \item covariateRef: a table with the covariate information - \item labels: a table (rowId, outcomeCount, ...) for each data point in the train data (outcomeCount is the class label) - \item folds: a table (rowId, index) specifying which training fold each data point is in. - } + \item labels: a table (rowId, outcomeCount, ...) for each data point +in the train data (outcomeCount is the class label) + \item folds: a table (rowId, index) specifying which training +fold each data point is in. + } Test is an Andromeda object containing -\itemize{\item covariates: a table (rowId, covariateId, covariateValue) containing the covariates for each data point in the test data +\itemize{\item covariates: a table (rowId, covariateId, covariateValue) +containing the covariates for each data point in the test data \item covariateRef: a table with the covariate information - \item labels: a table (rowId, outcomeCount, ...) for each data point in the test data (outcomeCount is the class label) + \item labels: a table (rowId, outcomeCount, ...) for each data +point in the test data (outcomeCount is the class label) } }