diff --git a/DESCRIPTION b/DESCRIPTION index 7d2f0bed..db134add 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -35,13 +35,11 @@ Imports: Matrix, memuse, ParallelLogger (>= 2.0.0), - polspline, pROC, PRROC, reticulate (>= 1.30), rlang, SqlRender (>= 1.1.3), - survival, tidyr, utils Suggests: @@ -56,6 +54,7 @@ Suggests: mgcv, parallel, plyr, + polspline, pool, readr, ResourceSelection, @@ -64,6 +63,7 @@ Suggests: RSQLite, scoring, ShinyAppBuilder (>= 1.1.1), + survival, survminer, testthat, withr, diff --git a/PatientLevelPrediction.Rproj b/PatientLevelPrediction.Rproj index 6284fc10..396fe126 100644 --- a/PatientLevelPrediction.Rproj +++ b/PatientLevelPrediction.Rproj @@ -1,4 +1,5 @@ Version: 1.0 +ProjectId: 6acb9f49-7428-4e24-8a2a-6b10f35b95e2 RestoreWorkspace: No SaveWorkspace: No diff --git a/R/CyclopsModels.R b/R/CyclopsModels.R index 760f151d..28131b1f 100644 --- a/R/CyclopsModels.R +++ b/R/CyclopsModels.R @@ -18,42 +18,40 @@ fitCyclopsModel <- function( - trainData, - modelSettings, # old:param, - search='adaptive', - analysisId, - ...){ - + trainData, + modelSettings, # old:param, + search = "adaptive", + analysisId, + ...) { param <- modelSettings$param - + # check plpData is coo format: - if (!FeatureExtraction::isCovariateData(trainData$covariateData)){ + if (!FeatureExtraction::isCovariateData(trainData$covariateData)) { stop("Needs correct covariateData") } - - settings <- attr(param, 'settings') - - trainData$covariateData$labels <- trainData$labels %>% + + settings <- attr(param, "settings") + + trainData$covariateData$labels <- trainData$labels %>% dplyr::mutate( - y = sapply(.data$outcomeCount, function(x) min(1,x)), + y = sapply(.data$outcomeCount, function(x) min(1, x)), time = .data$survivalTime ) - + covariates <- filterCovariateIds(param, trainData$covariateData) - + if (!is.null(param$priorCoefs)) { sourceCoefs <- param$priorCoefs %>% - dplyr::filter(abs(.data$betas)>0 & .data$covariateIds != "(Intercept)") - + dplyr::filter(abs(.data$betas) > 0 & .data$covariateIds != "(Intercept)") + newCovariates <- covariates %>% dplyr::filter(.data$covariateId %in% !!sourceCoefs$covariateIds) %>% - dplyr::mutate(newCovariateId = .data$covariateId*-1) %>% + dplyr::mutate(newCovariateId = .data$covariateId * -1) %>% dplyr::select(-"covariateId") %>% dplyr::rename(covariateId = .data$newCovariateId) %>% dplyr::collect() - + Andromeda::appendToTable(covariates, newCovariates) - } start <- Sys.time() @@ -66,155 +64,161 @@ fitCyclopsModel <- function( checkRowIds = FALSE, normalize = NULL, quiet = TRUE - ) - + ) + if (!is.null(param$priorCoefs)) { - fixedCoefficients <- c(FALSE, - rep(TRUE, nrow(sourceCoefs)), - rep(FALSE, length(cyclopsData$coefficientNames)-(nrow(sourceCoefs)+1))) - + fixedCoefficients <- c( + FALSE, + rep(TRUE, nrow(sourceCoefs)), + rep(FALSE, length(cyclopsData$coefficientNames) - (nrow(sourceCoefs) + 1)) + ) + startingCoefficients <- rep(0, length(fixedCoefficients)) - + # skip intercept index - startingCoefficients[2:(nrow(sourceCoefs)+1)] <- sourceCoefs$betas + startingCoefficients[2:(nrow(sourceCoefs) + 1)] <- sourceCoefs$betas } else { startingCoefficients <- NULL - fixedCoefficients <- NULL + fixedCoefficients <- NULL } - if(settings$crossValidationInPrior){ - param$priorParams$useCrossValidation <- max(trainData$folds$index)>1 + if (settings$crossValidationInPrior) { + param$priorParams$useCrossValidation <- max(trainData$folds$index) > 1 } prior <- do.call(eval(parse(text = settings$priorfunction)), param$priorParams) - if(settings$useControl){ - + if (settings$useControl) { control <- Cyclops::createControl( cvType = "auto", fold = max(trainData$folds$index), startingVariance = param$priorParams$variance, lowerLimit = param$lowerLimit, upperLimit = param$upperLimit, - tolerance = settings$tolerance, + tolerance = settings$tolerance, cvRepetitions = 1, # make an option? selectorType = settings$selectorType, noiseLevel = "silent", threads = settings$threads, maxIterations = settings$maxIterations, seed = settings$seed - ) - - fit <- tryCatch({ - ParallelLogger::logInfo('Running Cyclops') - Cyclops::fitCyclopsModel( - cyclopsData = cyclopsData, - prior = prior, - control = control, - fixedCoefficients = fixedCoefficients, - startingCoefficients = startingCoefficients - )}, - finally = ParallelLogger::logInfo('Done.') - ) - } else{ - fit <- tryCatch({ - ParallelLogger::logInfo('Running Cyclops with fixed varience') - Cyclops::fitCyclopsModel(cyclopsData, prior = prior)}, - finally = ParallelLogger::logInfo('Done.')) + ) + + fit <- tryCatch( + { + ParallelLogger::logInfo("Running Cyclops") + Cyclops::fitCyclopsModel( + cyclopsData = cyclopsData, + prior = prior, + control = control, + fixedCoefficients = fixedCoefficients, + startingCoefficients = startingCoefficients + ) + }, + finally = ParallelLogger::logInfo("Done.") + ) + } else { + fit <- tryCatch( + { + ParallelLogger::logInfo("Running Cyclops with fixed varience") + Cyclops::fitCyclopsModel(cyclopsData, prior = prior) + }, + finally = ParallelLogger::logInfo("Done.") + ) } - - + + modelTrained <- createCyclopsModel( - fit = fit, - modelType = settings$modelType, - useCrossValidation = max(trainData$folds$index)>1, - cyclopsData = cyclopsData, + fit = fit, + modelType = settings$modelType, + useCrossValidation = max(trainData$folds$index) > 1, + cyclopsData = cyclopsData, labels = trainData$covariateData$labels, folds = trainData$folds, priorType = param$priorParams$priorType - ) - + ) + if (!is.null(param$priorCoefs)) { modelTrained$coefficients <- reparamTransferCoefs(modelTrained$coefficients) } - + # TODO get optimal lambda value - ParallelLogger::logTrace('Returned from fitting to LassoLogisticRegression') + ParallelLogger::logTrace("Returned from fitting to LassoLogisticRegression") comp <- Sys.time() - start - ParallelLogger::logTrace('Getting variable importance') + ParallelLogger::logTrace("Getting variable importance") variableImportance <- getVariableImportance(modelTrained, trainData) - - #get prediction on test set: - ParallelLogger::logTrace('Getting predictions on train set') + + # get prediction on test set: + ParallelLogger::logTrace("Getting predictions on train set") tempModel <- list(model = modelTrained) - attr(tempModel, "modelType") <- attr(param, 'modelType') + attr(tempModel, "modelType") <- attr(param, "modelType") prediction <- predictCyclops( plpModel = tempModel, - cohort = trainData$labels, + cohort = trainData$labels, data = trainData - ) - prediction$evaluationType <- 'Train' - + ) + prediction$evaluationType <- "Train" + # get cv AUC if exists cvPerFold <- data.frame() - if(!is.null(modelTrained$cv)){ - cvPrediction <- do.call(rbind, lapply(modelTrained$cv, function(x){x$predCV})) - cvPrediction$evaluationType <- 'CV' - # fit date issue convertion caused by andromeda - cvPrediction$cohortStartDate <- as.Date(cvPrediction$cohortStartDate, origin = '1970-01-01') - - prediction <- rbind(prediction, cvPrediction[,colnames(prediction)]) - - cvPerFold <- unlist(lapply(modelTrained$cv, function(x){x$out_sample_auc})) - if(length(cvPerFold)>0){ + if (!is.null(modelTrained$cv)) { + cvPrediction <- do.call(rbind, lapply(modelTrained$cv, function(x) { + x$predCV + })) + cvPrediction$evaluationType <- "CV" + # fit date issue convertion caused by andromeda + cvPrediction$cohortStartDate <- as.Date(cvPrediction$cohortStartDate, origin = "1970-01-01") + + prediction <- rbind(prediction, cvPrediction[, colnames(prediction)]) + + cvPerFold <- unlist(lapply(modelTrained$cv, function(x) { + x$out_sample_auc + })) + if (length(cvPerFold) > 0) { cvPerFold <- data.frame( - metric = 'AUC', + metric = "AUC", fold = 1:length(cvPerFold), value = cvPerFold, - startingVariance = ifelse(is.null(param$priorParams$variance), 'NULL', param$priorParams$variance), - lowerLimit = ifelse(is.null(param$lowerLimit), 'NULL', param$lowerLimit), - upperLimit = ifelse(is.null(param$upperLimit), 'NULL', param$upperLimit), - tolerance = ifelse(is.null(settings$tolerance), 'NULL', settings$tolerance) + startingVariance = ifelse(is.null(param$priorParams$variance), "NULL", param$priorParams$variance), + lowerLimit = ifelse(is.null(param$lowerLimit), "NULL", param$lowerLimit), + upperLimit = ifelse(is.null(param$upperLimit), "NULL", param$upperLimit), + tolerance = ifelse(is.null(settings$tolerance), "NULL", settings$tolerance) ) - } else{ + } else { cvPerFold <- data.frame() } - + # remove the cv from the model: modelTrained$cv <- NULL } - + result <- list( model = modelTrained, - preprocessing = list( - featureEngineering = attr(trainData, "metaData")$featureEngineering,#learned mapping - tidyCovariates = attr(trainData$covariateData, "metaData")$tidyCovariateDataSettings, #learned mapping + featureEngineering = attr(trainData, "metaData")$featureEngineering, # learned mapping + tidyCovariates = attr(trainData$covariateData, "metaData")$tidyCovariateDataSettings, # learned mapping requireDenseMatrix = F ), - prediction = prediction, - modelDesign = PatientLevelPrediction::createModelDesign( targetId = attr(trainData, "metaData")$targetId, # added outcomeId = attr(trainData, "metaData")$outcomeId, # added restrictPlpDataSettings = attr(trainData, "metaData")$restrictPlpDataSettings, # made this restrictPlpDataSettings covariateSettings = attr(trainData, "metaData")$covariateSettings, - populationSettings = attr(trainData, "metaData")$populationSettings, + populationSettings = attr(trainData, "metaData")$populationSettings, featureEngineeringSettings = attr(trainData, "metaData")$featureEngineeringSettings, preprocessSettings = attr(trainData$covariateData, "metaData")$preprocessSettings, - modelSettings = modelSettings, #modified + modelSettings = modelSettings, # modified splitSettings = attr(trainData, "metaData")$splitSettings, sampleSettings = attr(trainData, "metaData")$sampleSettings ), - trainDetails = list( - analysisId = analysisId, - analysisSource = '', #TODO add from model + analysisId = analysisId, + analysisSource = "", # TODO add from model developmentDatabase = attr(trainData, "metaData")$cdmDatabaseName, - developmentDatabaseSchema = attr(trainData, "metaData")$cdmDatabaseSchema, - attrition = attr(trainData, "metaData")$attrition, - trainingTime = paste(as.character(abs(comp)), attr(comp,'units')), + developmentDatabaseSchema = attr(trainData, "metaData")$cdmDatabaseSchema, + attrition = attr(trainData, "metaData")$attrition, + trainingTime = paste(as.character(abs(comp)), attr(comp, "units")), trainingDate = Sys.Date(), modelName = settings$modelType, finalModelParameters = list( @@ -223,15 +227,14 @@ fitCyclopsModel <- function( ), hyperParamSearch = cvPerFold ), - covariateImportance = variableImportance ) - - - class(result) <- 'plpModel' - attr(result, 'predictionFunction') <- 'predictCyclops' - attr(result, 'modelType') <- attr(param, 'modelType') - attr(result, 'saveType') <- attr(param, 'saveType') + + + class(result) <- "plpModel" + attr(result, "predictionFunction") <- "predictCyclops" + attr(result, "modelType") <- attr(param, "modelType") + attr(result, "saveType") <- attr(param, "saveType") return(result) } @@ -248,21 +251,21 @@ fitCyclopsModel <- function( #' #' @param plpModel An object of type \code{predictiveModel} as generated using #' \code{\link{fitPlp}}. -#' @param data The new plpData containing the covariateData for the new population +#' @param data The new plpData containing the covariateData for the new population #' @param cohort The cohort to calculate the prediction for #' @export -predictCyclops <- function(plpModel, data, cohort ) { +predictCyclops <- function(plpModel, data, cohort) { start <- Sys.time() - - ParallelLogger::logTrace('predictProbabilities - predictAndromeda start') - + + ParallelLogger::logTrace("predictProbabilities - predictAndromeda start") + prediction <- predictCyclopsType( plpModel$model$coefficients, cohort, data$covariateData, plpModel$model$modelType ) - + # survival cyclops use baseline hazard to convert to risk from exp(LP) to 1-S^exp(LP) if (attr(plpModel, "modelType") == "survival") { if (!is.null(plpModel$model$baselineSurvival)) { @@ -274,17 +277,17 @@ predictCyclops <- function(plpModel, data, cohort ) { bhind <- which.min(abs(plpModel$model$baselineSurvival$time - timepoint)) # 1- baseline survival(time)^ (exp(betas*values)) prediction$value <- 1 - plpModel$model$baselineSurvival$surv[bhind]^prediction$value - - + + metaData <- list() metaData$baselineSurvivalTimepoint <- plpModel$model$baselineSurvival$time[bhind] metaData$baselineSurvival <- plpModel$model$baselineSurvival$surv[bhind] metaData$offset <- 0 - + attr(prediction, "metaData") <- metaData } } - + delta <- Sys.time() - start ParallelLogger::logInfo("Prediction took ", signif(delta, 3), " ", attr(delta, "units")) return(prediction) @@ -297,25 +300,26 @@ predictCyclopsType <- function(coefficients, population, covariateData, modelTyp if (!FeatureExtraction::isCovariateData(covariateData)) { stop("Needs correct covariateData") } - + intercept <- coefficients$betas[coefficients$covariateIds %in% "(Intercept)"] if (length(intercept) == 0) intercept <- 0 betas <- coefficients$betas[!coefficients$covariateIds %in% "(Intercept)"] - coefficients <- data.frame(beta = betas, + coefficients <- data.frame( + beta = betas, covariateId = coefficients$covariateIds[coefficients$covariateIds != "(Intercept)"] ) coefficients <- coefficients[coefficients$beta != 0, ] if (sum(coefficients$beta != 0) > 0) { covariateData$coefficients <- coefficients on.exit(covariateData$coefficients <- NULL, add = TRUE) - - prediction <- covariateData$covariates %>% - dplyr::inner_join(covariateData$coefficients, by = "covariateId") %>% + + prediction <- covariateData$covariates %>% + dplyr::inner_join(covariateData$coefficients, by = "covariateId") %>% dplyr::mutate(values = .data$covariateValue * .data$beta) %>% dplyr::group_by(.data$rowId) %>% dplyr::summarise(value = sum(.data$values, na.rm = TRUE)) %>% dplyr::select("rowId", "value") - + prediction <- as.data.frame(prediction) prediction <- merge(population, prediction, by = "rowId", all.x = TRUE, fill = 0) prediction$value[is.na(prediction$value)] <- 0 @@ -332,35 +336,32 @@ predictCyclopsType <- function(coefficients, population, covariateData, modelTyp prediction$value <- link(prediction$value) attr(prediction, "metaData")$modelType <- "binary" } else if (modelType == "poisson" || modelType == "survival" || modelType == "cox") { - # add baseline hazard stuff - + prediction$value <- exp(prediction$value) attr(prediction, "metaData")$modelType <- "survival" if (modelType == "survival") { # is this needed? attr(prediction, "metaData")$timepoint <- max(population$survivalTime, na.rm = TRUE) } - } return(prediction) } createCyclopsModel <- function(fit, modelType, useCrossValidation, cyclopsData, labels, folds, - priorType){ - + priorType) { if (is.character(fit)) { coefficients <- c(0) - names(coefficients) <- '' + names(coefficients) <- "" status <- fit } else if (fit$return_flag == "ILLCONDITIONED") { coefficients <- c(0) - names(coefficients) <- '' + names(coefficients) <- "" status <- "ILL CONDITIONED, CANNOT FIT" ParallelLogger::logWarn(paste("GLM fitting issue: ", status)) } else if (fit$return_flag == "MAX_ITERATIONS") { coefficients <- c(0) - names(coefficients) <- '' + names(coefficients) <- "" status <- "REACHED MAXIMUM NUMBER OF ITERATIONS, CANNOT FIT" ParallelLogger::logWarn(paste("GLM fitting issue: ", status)) } else { @@ -368,12 +369,12 @@ createCyclopsModel <- function(fit, modelType, useCrossValidation, cyclopsData, coefficients <- stats::coef(fit) # not sure this is stats?? ParallelLogger::logInfo(paste("GLM fit status: ", status)) } - + # use a dataframe for the coefficients betas <- as.numeric(coefficients) betaNames <- names(coefficients) - coefficients <- data.frame(betas=betas, covariateIds=betaNames) - + coefficients <- data.frame(betas = betas, covariateIds = betaNames) + outcomeModel <- list( priorVariance = fit$variance, log_likelihood = fit$log_likelihood, @@ -382,130 +383,142 @@ createCyclopsModel <- function(fit, modelType, useCrossValidation, cyclopsData, coefficients = coefficients ) - if(modelType == "cox" || modelType == "survival") { - baselineSurvival <- tryCatch({survival::survfit(fit, type = "aalen")}, - error = function(e) {ParallelLogger::logInfo(e); return(NULL)}) - if(is.null(baselineSurvival)){ - ParallelLogger::logInfo('No baseline hazard function returned') + if (modelType == "cox" || modelType == "survival") { + baselineSurvival <- tryCatch( + { + survival::survfit(fit, type = "aalen") + }, + error = function(e) { + ParallelLogger::logInfo(e) + return(NULL) + } + ) + if (is.null(baselineSurvival)) { + ParallelLogger::logInfo("No baseline hazard function returned") } outcomeModel$baselineSurvival <- baselineSurvival } class(outcomeModel) <- "plpModel" - - #get CV - added && status == "OK" to only run if the model fit sucsessfully - if(modelType == "logistic" && useCrossValidation && status == "OK"){ - outcomeModel$cv <- getCV(cyclopsData, labels, cvVariance = fit$variance, folds = folds, - priorType = priorType) + + # get CV - added && status == "OK" to only run if the model fit sucsessfully + if (modelType == "logistic" && useCrossValidation && status == "OK") { + outcomeModel$cv <- getCV(cyclopsData, labels, + cvVariance = fit$variance, folds = folds, + priorType = priorType + ) } - - return(outcomeModel) + return(outcomeModel) } -modelTypeToCyclopsModelType <- function(modelType, stratified=F) { +modelTypeToCyclopsModelType <- function(modelType, stratified = F) { if (modelType == "logistic") { - if (stratified) + if (stratified) { return("clr") - else + } else { return("lr") + } } else if (modelType == "poisson") { - if (stratified) + if (stratified) { return("cpr") - else + } else { return("pr") + } } else if (modelType == "cox") { return("cox") } else { ParallelLogger::logError(paste("Unknown model type:", modelType)) stop() } - } getCV <- function( - cyclopsData, - labels, - cvVariance, - folds, - priorType -) -{ - fixed_prior <- Cyclops::createPrior(priorType = priorType, - variance = cvVariance, - useCrossValidation = FALSE) - + cyclopsData, + labels, + cvVariance, + folds, + priorType) { + fixed_prior <- Cyclops::createPrior( + priorType = priorType, + variance = cvVariance, + useCrossValidation = FALSE + ) + # add the index to the labels - labels <- merge(labels, folds, by = 'rowId') - + labels <- merge(labels, folds, by = "rowId") + result <- lapply(1:max(labels$index), function(i) { - hold_out <- labels$index==i + hold_out <- labels$index == i weights <- rep(1.0, Cyclops::getNumberOfRows(cyclopsData)) weights[hold_out] <- 0.0 subset_fit <- suppressWarnings(Cyclops::fitCyclopsModel(cyclopsData, prior = fixed_prior, - weights = weights)) + weights = weights + )) predict <- stats::predict(subset_fit) - + auc <- aucWithoutCi(predict[hold_out], labels$y[hold_out]) - - predCV <- cbind(labels[hold_out,], - value = predict[hold_out]) - #predCV$outcomeCount <- predCV$y - - return(list(out_sample_auc = auc, + + predCV <- cbind(labels[hold_out, ], + value = predict[hold_out] + ) + # predCV$outcomeCount <- predCV$y + + return(list( + out_sample_auc = auc, predCV = predCV, log_likelihood = subset_fit$log_likelihood, log_prior = subset_fit$log_prior, - coef = stats::coef(subset_fit))) + coef = stats::coef(subset_fit) + )) }) - - return(result) - + + return(result) } -getVariableImportance <- function(modelTrained, trainData){ +getVariableImportance <- function(modelTrained, trainData) { varImp <- data.frame( - covariateId = as.double(modelTrained$coefficients$covariateIds[modelTrained$coefficients$covariateIds!='(Intercept)']), - value = modelTrained$coefficients$betas[modelTrained$coefficients$covariateIds!='(Intercept)'] + covariateId = as.double(modelTrained$coefficients$covariateIds[modelTrained$coefficients$covariateIds != "(Intercept)"]), + value = modelTrained$coefficients$betas[modelTrained$coefficients$covariateIds != "(Intercept)"] ) -if(sum(abs(varImp$value)>0)==0){ - ParallelLogger::logWarn('No non-zero coefficients') - varImp <- NULL -} else { - ParallelLogger::logInfo('Creating variable importance data frame') - - #trainData$covariateData$varImp <- varImp - #on.exit(trainData$covariateData$varImp <- NULL, add = T) - - varImp <- trainData$covariateData$covariateRef %>% - dplyr::collect() %>% - #dplyr::left_join(trainData$covariateData$varImp) %>% - dplyr::left_join(varImp, by = 'covariateId') %>% - dplyr::mutate(covariateValue = ifelse(is.na(.data$value), 0, .data$value)) %>% - dplyr::select(-"value") %>% - dplyr::arrange(-abs(.data$covariateValue)) %>% - dplyr::collect() -} - + if (sum(abs(varImp$value) > 0) == 0) { + ParallelLogger::logWarn("No non-zero coefficients") + varImp <- NULL + } else { + ParallelLogger::logInfo("Creating variable importance data frame") + + # trainData$covariateData$varImp <- varImp + # on.exit(trainData$covariateData$varImp <- NULL, add = T) + + varImp <- trainData$covariateData$covariateRef %>% + dplyr::collect() %>% + # dplyr::left_join(trainData$covariateData$varImp) %>% + dplyr::left_join(varImp, by = "covariateId") %>% + dplyr::mutate(covariateValue = ifelse(is.na(.data$value), 0, .data$value)) %>% + dplyr::select(-"value") %>% + dplyr::arrange(-abs(.data$covariateValue)) %>% + dplyr::collect() + } + return(varImp) } -filterCovariateIds <- function(param, covariateData){ - if ( (length(param$includeCovariateIds) != 0) & (length(param$excludeCovariateIds) != 0)) { - covariates <- covariateData$covariates %>% - dplyr::filter(.data$covariateId %in% param$includeCovariateIds) %>% +filterCovariateIds <- function(param, covariateData) { + if ((length(param$includeCovariateIds) != 0) & (length(param$excludeCovariateIds) != 0)) { + covariates <- covariateData$covariates %>% + dplyr::filter(.data$covariateId %in% param$includeCovariateIds) %>% dplyr::filter(!.data$covariateId %in% param$excludeCovariateIds) # does not work - } else if ( (length(param$includeCovariateIds) == 0) & (length(param$excludeCovariateIds) != 0)) { - covariates <- covariateData$covariates %>% + } else if ((length(param$includeCovariateIds) == 0) & (length(param$excludeCovariateIds) != 0)) { + covariates <- covariateData$covariates %>% dplyr::filter(!.data$covariateId %in% param$excludeCovariateIds) # does not work - } else if ( (length(param$includeCovariateIds) != 0) & (length(param$excludeCovariateIds) == 0)) { + } else if ((length(param$includeCovariateIds) != 0) & (length(param$excludeCovariateIds) == 0)) { includeCovariateIds <- as.double(param$includeCovariateIds) # fixes odd dplyr issue with param - covariates <- covariateData$covariates %>% - dplyr::filter(.data$covariateId %in% includeCovariateIds) + covariates <- covariateData$covariates %>% + dplyr::filter(.data$covariateId %in% includeCovariateIds) } else { covariates <- covariateData$covariates } @@ -515,12 +528,12 @@ filterCovariateIds <- function(param, covariateData){ reparamTransferCoefs <- function(inCoefs) { transferCoefs <- inCoefs %>% dplyr::filter(grepl("-", .data$covariateIds)) - + transferCoefs$covariateIds <- substring(transferCoefs$covariateIds, 2) - + originalCoefs <- inCoefs %>% dplyr::filter(!grepl("-", .data$covariateIds)) - + coefs <- rbind(originalCoefs, transferCoefs) coefs <- rowsum(coefs$betas, coefs$covariateIds) coefs <- data.frame(betas = coefs, covariateIds = rownames(coefs), row.names = NULL) diff --git a/R/CyclopsSettings.R b/R/CyclopsSettings.R index 48388cab..40d0e484 100644 --- a/R/CyclopsSettings.R +++ b/R/CyclopsSettings.R @@ -3,7 +3,7 @@ #' @param variance Numeric: prior distribution starting variance #' @param seed An option to add a seed when training the model #' @param includeCovariateIds a set of covariate IDS to limit the analysis to -#' @param noShrinkage a set of covariates whcih are to be forced to be included in the final model. default is the intercept +#' @param noShrinkage a set of covariates whcih are to be forced to be included in the final model. default is the intercept #' @param threads An option to set number of threads when training model #' @param forceIntercept Logical: Force intercept coefficient into prior #' @param upperLimit Numeric: Upper prior variance limit for grid-search @@ -11,74 +11,72 @@ #' @param tolerance Numeric: maximum relative change in convergence criterion from successive iterations to achieve convergence #' @param maxIterations Integer: maximum iterations of Cyclops to attempt before returning a failed-to-converge error #' @param priorCoefs Use coefficients from a previous model as starting points for model fit (transfer learning) -#' +#' #' @examples #' model.lr <- setLassoLogisticRegression() #' @export -setLassoLogisticRegression<- function( - variance = 0.01, - seed = NULL, - includeCovariateIds = c(), - noShrinkage = c(0), - threads = -1, - forceIntercept = F, - upperLimit = 20, - lowerLimit = 0.01, - tolerance = 2e-06, - maxIterations = 3000, - priorCoefs = NULL - ){ - - checkIsClass(seed, c('numeric','NULL','integer')) - if(is.null(seed[1])){ - seed <- as.integer(sample(100000000,1)) +setLassoLogisticRegression <- function( + variance = 0.01, + seed = NULL, + includeCovariateIds = c(), + noShrinkage = c(0), + threads = -1, + forceIntercept = FALSE, + upperLimit = 20, + lowerLimit = 0.01, + tolerance = 2e-06, + maxIterations = 3000, + priorCoefs = NULL) { + checkIsClass(seed, c("numeric", "NULL", "integer")) + if (is.null(seed[1])) { + seed <- as.integer(sample(100000000, 1)) } - checkIsClass(threads, c('numeric','integer')) - checkIsClass(variance, c('numeric','integer')) + checkIsClass(threads, c("numeric", "integer")) + checkIsClass(variance, c("numeric", "integer")) checkHigherEqual(variance, 0) - - checkIsClass(lowerLimit, c('numeric','integer')) - checkIsClass(upperLimit, c('numeric','integer')) + + checkIsClass(lowerLimit, c("numeric", "integer")) + checkIsClass(upperLimit, c("numeric", "integer")) checkHigherEqual(upperLimit, lowerLimit) - + param <- list( priorParams = list( - priorType = "laplace", + priorType = "laplace", forceIntercept = forceIntercept, - variance = variance, + variance = variance, exclude = noShrinkage - ), - includeCovariateIds = includeCovariateIds, - upperLimit = upperLimit, + ), + includeCovariateIds = includeCovariateIds, + upperLimit = upperLimit, lowerLimit = lowerLimit, priorCoefs = priorCoefs - ) - - attr(param, 'settings') <- list( - priorfunction = 'Cyclops::createPrior', - selectorType = "byPid", # is this correct? - crossValidationInPrior = T, - modelType = 'logistic', - addIntercept = T, - useControl = T, + ) + + attr(param, "settings") <- list( + priorfunction = "Cyclops::createPrior", + selectorType = "byPid", # is this correct? + crossValidationInPrior = TRUE, + modelType = "logistic", + addIntercept = TRUE, + useControl = TRUE, seed = seed[1], name = "Lasso Logistic Regression", - threads = threads[1], - tolerance = tolerance[1], #2e-06 - cvRepetitions = 1, #1 - maxIterations = maxIterations[1] #3000 + threads = threads[1], + tolerance = tolerance[1], # 2e-06 + cvRepetitions = 1, # 1 + maxIterations = maxIterations[1] # 3000 ) - - attr(param, 'modelType') <- 'binary' - attr(param, 'saveType') <- 'RtoJson' - + + attr(param, "modelType") <- "binary" + attr(param, "saveType") <- "RtoJson" + result <- list( fitFunction = "fitCyclopsModel", param = param ) class(result) <- "modelSettings" - + return(result) } @@ -89,7 +87,7 @@ setLassoLogisticRegression<- function( #' @param variance Numeric: prior distribution starting variance #' @param seed An option to add a seed when training the model #' @param includeCovariateIds a set of covariate IDS to limit the analysis to -#' @param noShrinkage a set of covariates whcih are to be forced to be included in the final model. default is the intercept +#' @param noShrinkage a set of covariates whcih are to be forced to be included in the final model. default is the intercept #' @param threads An option to set number of threads when training model #' @param upperLimit Numeric: Upper prior variance limit for grid-search #' @param lowerLimit Numeric: Lower prior variance limit for grid-search @@ -100,79 +98,79 @@ setLassoLogisticRegression<- function( #' model.lr <- setCoxModel() #' @export setCoxModel <- function( - variance = 0.01, - seed = NULL, - includeCovariateIds = c(), - noShrinkage = c(), - threads = -1, - upperLimit = 20, - lowerLimit = 0.01, - tolerance = 2e-07, - maxIterations = 3000 -){ - - checkIsClass(seed, c('numeric','NULL','integer')) - if(is.null(seed[1])){ - seed <- as.integer(sample(100000000,1)) + variance = 0.01, + seed = NULL, + includeCovariateIds = c(), + noShrinkage = c(), + threads = -1, + upperLimit = 20, + lowerLimit = 0.01, + tolerance = 2e-07, + maxIterations = 3000) { + + checkSurvivalPackages() + checkIsClass(seed, c("numeric", "NULL", "integer")) + if (is.null(seed[1])) { + seed <- as.integer(sample(100000000, 1)) } - checkIsClass(threads, c('numeric','integer')) - checkIsClass(variance, c('numeric','integer')) + checkIsClass(threads, c("numeric", "integer")) + checkIsClass(variance, c("numeric", "integer")) checkHigherEqual(variance, 0) - - checkIsClass(lowerLimit, c('numeric','integer')) - checkIsClass(upperLimit, c('numeric','integer')) - + + checkIsClass(lowerLimit, c("numeric", "integer")) + checkIsClass(upperLimit, c("numeric", "integer")) + checkHigherEqual(upperLimit, lowerLimit) - - #selectorType = "byRow", + + # selectorType = "byRow", param <- list( priorParams = list( - priorType = "laplace", - variance = variance, + priorType = "laplace", + variance = variance, exclude = noShrinkage ), - includeCovariateIds = includeCovariateIds, - upperLimit = upperLimit, + includeCovariateIds = includeCovariateIds, + upperLimit = upperLimit, lowerLimit = lowerLimit ) - - attr(param, 'settings') <- list( - priorfunction = 'Cyclops::createPrior', + + attr(param, "settings") <- list( + priorfunction = "Cyclops::createPrior", selectorType = "byRow", - crossValidationInPrior = T, - modelType = 'cox', - addIntercept = F, - useControl = T, + crossValidationInPrior = TRUE, + modelType = "cox", + addIntercept = FALSE, + useControl = TRUE, seed = seed[1], name = "LASSO Cox Regression", - threads = threads[1], - tolerance = tolerance[1], #2e-07 - cvRepetitions = 1, #1 - maxIterations = maxIterations[1] #3000 + threads = threads[1], + tolerance = tolerance[1], # 2e-07 + cvRepetitions = 1, # 1 + maxIterations = maxIterations[1] # 3000 ) - - attr(param, 'modelType') <- 'survival' - attr(param, 'saveType') <- 'RtoJson' - + + attr(param, "modelType") <- "survival" + attr(param, "saveType") <- "RtoJson" + result <- list( fitFunction = "fitCyclopsModel", param = param ) class(result) <- "modelSettings" - + return(result) } #' Create setting for lasso logistic regression -#' +#' #' @param K The maximum number of non-zero predictors #' @param penalty Specifies the IHT penalty; possible values are `BIC` or `AIC` or a numeric value #' @param seed An option to add a seed when training the model #' @param exclude A vector of numbers or covariateId names to exclude from prior #' @param forceIntercept Logical: Force intercept coefficient into regularization -#' @param fitBestSubset Logical: Fit final subset with no regularization +#' @param fitBestSubset Logical: Fit final subset with no regularization #' @param initialRidgeVariance integer #' @param tolerance numeric #' @param maxIterations integer @@ -183,52 +181,56 @@ setCoxModel <- function( #' model.lr <- setLassoLogisticRegression() #' @export setIterativeHardThresholding <- function( - K = 10, - penalty = "bic", - seed = sample(100000, 1), - exclude = c(), - forceIntercept = FALSE, - fitBestSubset = FALSE, - initialRidgeVariance = 0.1, - tolerance = 1e-08, - maxIterations = 10000, - threshold = 1e-06, - delta = 0 - ) { - rlang::check_installed("IterativeHardThresholding") - - if (K < 1) + K = 10, + penalty = "bic", + seed = sample(100000, 1), + exclude = c(), + forceIntercept = FALSE, + fitBestSubset = FALSE, + initialRidgeVariance = 0.1, + tolerance = 1e-08, + maxIterations = 10000, + threshold = 1e-06, + delta = 0) { + rlang::check_installed("IterativeHardThresholding") + + if (K < 1) { stop("Invalid maximum number of predictors") - if (!(penalty %in% c("aic", "bic") || is.numeric(penalty))) + } + if (!(penalty %in% c("aic", "bic") || is.numeric(penalty))) { stop('Penalty must be "aic", "bic" or numeric') - if (!is.logical(forceIntercept)) + } + if (!is.logical(forceIntercept)) { stop("forceIntercept must be of type: logical") - if (!is.logical(fitBestSubset)) + } + if (!is.logical(fitBestSubset)) { stop("fitBestSubset must be of type: logical") - if (!inherits(x = seed, what = c("numeric", "NULL", "integer"))) + } + if (!inherits(x = seed, what = c("numeric", "NULL", "integer"))) { stop("Invalid seed") + } # set seed if (is.null(seed[1])) { seed <- as.integer(sample(100000000, 1)) } - + param <- list( priorParams = list( K = K, - penalty = penalty, + penalty = penalty, exclude = exclude, forceIntercept = forceIntercept, fitBestSubset = fitBestSubset, initialRidgeVariance = initialRidgeVariance, - tolerance = tolerance[1], - maxIterations = maxIterations[1], - threshold = threshold, + tolerance = tolerance[1], + maxIterations = maxIterations[1], + threshold = threshold, delta = delta ) ) - + attr(param, "settings") <- list( priorfunction = "IterativeHardThresholding::createIhtPrior", selectorType = "byRow", @@ -239,15 +241,15 @@ setIterativeHardThresholding <- function( seed = seed[1], name = "Iterative Hard Thresholding" ) - - attr(param, "modelType") <- "binary" + + attr(param, "modelType") <- "binary" attr(param, "saveType") <- "RtoJson" - + result <- list( fitFunction = "fitCyclopsModel", param = param ) class(result) <- "modelSettings" - + return(result) } diff --git a/R/EvaluationSummary.R b/R/EvaluationSummary.R index e35b5f1b..96cb4357 100644 --- a/R/EvaluationSummary.R +++ b/R/EvaluationSummary.R @@ -1,236 +1,246 @@ getEvaluationStatistics <- function( - prediction, - predictionType, - typeColumn = 'evaluation' -){ - + prediction, + predictionType, + typeColumn = "evaluation") { evaluation <- do.call( - what = paste0('getEvaluationStatistics_', predictionType), + what = paste0("getEvaluationStatistics_", predictionType), args = list( - prediction = prediction, + prediction = prediction, evalColumn = typeColumn, - timepoint = attr(prediction, 'metaData')$timepoint + timepoint = attr(prediction, "metaData")$timepoint ) ) - + return(evaluation) } # get all the standard metrics for a given evaluation type # function to calculate evaluation summary data.frame with columns: evaluation, metric, value -getEvaluationStatistics_binary <- function(prediction, evalColumn, ...){ - +getEvaluationStatistics_binary <- function(prediction, evalColumn, ...) { result <- c() - evalTypes <- unique(as.data.frame(prediction)[,evalColumn]) + evalTypes <- unique(as.data.frame(prediction)[, evalColumn]) - for(evalType in evalTypes){ - + for (evalType in evalTypes) { predictionOfInterest <- prediction %>% dplyr::filter(.data[[evalColumn]] == evalType) - + result <- rbind( result, - c(evalType, 'populationSize', nrow(predictionOfInterest)), - c(evalType, 'outcomeCount', sum(predictionOfInterest$outcomeCount)) + c(evalType, "populationSize", nrow(predictionOfInterest)), + c(evalType, "outcomeCount", sum(predictionOfInterest$outcomeCount)) ) - + # auc - ParallelLogger::logInfo(paste0('Calculating Performance for ', evalType)) - ParallelLogger::logInfo('=============') - - ParallelLogger::logTrace('Calculating AUC') + ParallelLogger::logInfo(paste0("Calculating Performance for ", evalType)) + ParallelLogger::logInfo("=============") + + ParallelLogger::logTrace("Calculating AUC") auc <- computeAuc(predictionOfInterest, confidenceInterval = T) - + result <- rbind( - result, - c(evalType, 'AUROC', auc[1]), - c(evalType, '95% lower AUROC', auc[2]), - c(evalType, '95% upper AUROC', auc[3]) + result, + c(evalType, "AUROC", auc[1]), + c(evalType, "95% lower AUROC", auc[2]), + c(evalType, "95% upper AUROC", auc[3]) ) - ParallelLogger::logInfo(sprintf('%-20s%.2f', 'AUC', auc[1]*100)) - ParallelLogger::logInfo(sprintf('%-20s%.2f', '95% lower AUC: ', auc[2]*100)) - ParallelLogger::logInfo(sprintf('%-20s%.2f', '95% upper AUC: ', auc[3]*100)) - + ParallelLogger::logInfo(sprintf("%-20s%.2f", "AUC", auc[1] * 100)) + ParallelLogger::logInfo(sprintf("%-20s%.2f", "95% lower AUC: ", auc[2] * 100)) + ParallelLogger::logInfo(sprintf("%-20s%.2f", "95% upper AUC: ", auc[3] * 100)) + # auprc - ParallelLogger::logTrace('Calculating AUPRC') + ParallelLogger::logTrace("Calculating AUPRC") positive <- predictionOfInterest$value[predictionOfInterest$outcomeCount == 1] negative <- predictionOfInterest$value[predictionOfInterest$outcomeCount == 0] pr <- PRROC::pr.curve(scores.class0 = positive, scores.class1 = negative) auprc <- pr$auc.integral result <- rbind( - result, - c(evalType, 'AUPRC', auprc) + result, + c(evalType, "AUPRC", auprc) ) - ParallelLogger::logInfo(sprintf('%-20s%.2f', 'AUPRC: ', auprc*100)) - + ParallelLogger::logInfo(sprintf("%-20s%.2f", "AUPRC: ", auprc * 100)) + # brier scores-returnss; brier, brierScaled - ParallelLogger::logTrace('Calculating Brier Score') + ParallelLogger::logTrace("Calculating Brier Score") brier <- brierScore(predictionOfInterest) result <- rbind( - result, - c(evalType, 'brier score', brier$brier), - c(evalType, 'brier score scaled', brier$brierScaled) + result, + c(evalType, "brier score", brier$brier), + c(evalType, "brier score scaled", brier$brierScaled) ) - ParallelLogger::logInfo(sprintf('%-20s%.2f', 'Brier: ', brier$brier)) + ParallelLogger::logInfo(sprintf("%-20s%.2f", "Brier: ", brier$brier)) + - # using rms::val.prob - indValProb <- predictionOfInterest$value>0 & predictionOfInterest$value < 1 + indValProb <- predictionOfInterest$value > 0 & predictionOfInterest$value < 1 valProb <- tryCatch( calculateEStatisticsBinary(prediction = predictionOfInterest[indValProb, ]), error = function(e) { - ParallelLogger::logInfo(e); return( + ParallelLogger::logInfo(e) + return( c( - Eavg = 0, - E90 = 0, + Eavg = 0, + E90 = 0, Emax = 0 ) ) } ) result <- rbind( - result, - c(evalType, 'Eavg', valProb['Eavg']), - c(evalType, 'E90', valProb['E90']), - c(evalType, 'Emax', valProb['Emax']) + result, + c(evalType, "Eavg", valProb["Eavg"]), + c(evalType, "E90", valProb["E90"]), + c(evalType, "Emax", valProb["Emax"]) ) - ParallelLogger::logInfo(sprintf('%-20s%.2f', 'Eavg: ', round(valProb['Eavg'], digits = 4))) + ParallelLogger::logInfo(sprintf("%-20s%.2f", "Eavg: ", round(valProb["Eavg"], digits = 4))) + + - - # Removing for now as too slow... - #ici <- ici(prediction) - #result <- rbind( - # result, + # ici <- ici(prediction) + # result <- rbind( + # result, # c(evalType, 'ici', ifelse(is.null(ici), 'NA', ici)) - #) - #ParallelLogger::logInfo(paste0('ICI ', round(ifelse(is.null(ici), 'NA', ici), digits = 4))) - - + # ) + # ParallelLogger::logInfo(paste0('ICI ', round(ifelse(is.null(ici), 'NA', ici), digits = 4))) + + # calibration linear fit- returns gradient, intercept - ParallelLogger::logTrace('Calculating Calibration-in-large') + ParallelLogger::logTrace("Calculating Calibration-in-large") calinlarge <- calibrationInLarge(predictionOfInterest) result <- rbind( - result, - c(evalType, 'calibrationInLarge mean prediction', calinlarge$meanPredictionRisk), - c(evalType, 'calibrationInLarge observed risk', calinlarge$observedRisk) + result, + c(evalType, "calibrationInLarge mean prediction", calinlarge$meanPredictionRisk), + c(evalType, "calibrationInLarge observed risk", calinlarge$observedRisk) ) - ParallelLogger::logInfo(paste0('Calibration in large- Mean predicted risk ', round(calinlarge$meanPredictionRisk, digits = 4), ' : observed risk ',round(calinlarge$observedRisk, digits = 4))) - + ParallelLogger::logInfo(paste0("Calibration in large- Mean predicted risk ", round(calinlarge$meanPredictionRisk, digits = 4), " : observed risk ", round(calinlarge$observedRisk, digits = 4))) + calinlargeInt <- calibrationInLargeIntercept(predictionOfInterest) result <- rbind( - result, - c(evalType, 'calibrationInLarge intercept', calinlargeInt) + result, + c(evalType, "calibrationInLarge intercept", calinlargeInt) ) - ParallelLogger::logInfo(paste0('Calibration in large- Intercept ', round(calinlargeInt, digits = 4))) - - - ParallelLogger::logTrace('Calculating Weak Calibration') + ParallelLogger::logInfo(paste0("Calibration in large- Intercept ", round(calinlargeInt, digits = 4))) + + + ParallelLogger::logTrace("Calculating Weak Calibration") weakCal <- calibrationWeak(predictionOfInterest) result <- rbind( - result, - c(evalType, 'weak calibration intercept', weakCal$intercept), - c(evalType, 'weak calibration gradient', weakCal$gradient) + result, + c(evalType, "weak calibration intercept", weakCal$intercept), + c(evalType, "weak calibration gradient", weakCal$gradient) ) - ParallelLogger::logInfo(paste0('Weak calibration intercept: ', - round(weakCal$intercept, digits = 4), - ' - gradient:',round(weakCal$gradient, digits = 4))) - - ParallelLogger::logTrace('Calculating Hosmer-Lemeshow Calibration Line') + ParallelLogger::logInfo(paste0( + "Weak calibration intercept: ", + round(weakCal$intercept, digits = 4), + " - gradient:", round(weakCal$gradient, digits = 4) + )) + + ParallelLogger::logTrace("Calculating Hosmer-Lemeshow Calibration Line") calLine10 <- calibrationLine(predictionOfInterest, numberOfStrata = 10) result <- rbind( - result, - c(evalType, 'Hosmer-Lemeshow calibration intercept', calLine10$lm[1]), - c(evalType, 'Hosmer-Lemeshow calibration gradient', calLine10$lm[2]) + result, + c(evalType, "Hosmer-Lemeshow calibration intercept", calLine10$lm[1]), + c(evalType, "Hosmer-Lemeshow calibration gradient", calLine10$lm[2]) ) - ParallelLogger::logInfo(sprintf('%-20s%.2f%-20s%.2f', 'Hosmer-Lemeshow calibration gradient: ', calLine10$lm[2], ' intercept: ',calLine10$lm[1])) - + ParallelLogger::logInfo(sprintf("%-20s%.2f%-20s%.2f", "Hosmer-Lemeshow calibration gradient: ", calLine10$lm[2], " intercept: ", calLine10$lm[1])) + # Extra: Average Precision aveP.val <- averagePrecision(predictionOfInterest) result <- rbind( - result, - c(evalType, 'Average Precision', aveP.val) + result, + c(evalType, "Average Precision", aveP.val) ) - ParallelLogger::logInfo(sprintf('%-20s%.2f', 'Average Precision: ', aveP.val)) - + ParallelLogger::logInfo(sprintf("%-20s%.2f", "Average Precision: ", aveP.val)) } - + result <- as.data.frame(result) - colnames(result) <- c('evaluation','metric','value') - + colnames(result) <- c("evaluation", "metric", "value") + return(result) } -getEvaluationStatistics_survival <- function(prediction, evalColumn, timepoint, ...){ - - if(is.null(prediction$survivalTime)){ - stop('No survival time column present') +getEvaluationStatistics_survival <- function(prediction, evalColumn, timepoint, ...) { + if (is.null(prediction$survivalTime)) { + stop("No survival time column present") } - + result <- c() - evalTypes <- unique(as.data.frame(prediction)[,evalColumn]) - - for(evalType in evalTypes){ - + evalTypes <- unique(as.data.frame(prediction)[, evalColumn]) + + for (evalType in evalTypes) { predictionOfInterest <- prediction %>% dplyr::filter(.data[[evalColumn]] == evalType) - + result <- rbind( result, - c(evalType, timepoint, 'populationSize', nrow(predictionOfInterest)), - c(evalType, timepoint, 'outcomeCount', sum(predictionOfInterest$outcomeCount)) + c(evalType, timepoint, "populationSize", nrow(predictionOfInterest)), + c(evalType, timepoint, "outcomeCount", sum(predictionOfInterest$outcomeCount)) ) - - #============================ - - ##timepoint <- attr(prediction, 'metaData')$timepoint #max(prediction$survivalTime) - ParallelLogger::logInfo(paste0('Evaluating survival model at time: ', timepoint, ' days')) - + + # ============================ + + ## timepoint <- attr(prediction, 'metaData')$timepoint #max(prediction$survivalTime) + ParallelLogger::logInfo(paste0("Evaluating survival model at time: ", timepoint, " days")) + t <- predictionOfInterest$survivalTime y <- ifelse(predictionOfInterest$outcomeCount > 0, 1, 0) - - S <- survival::Surv(t, y) + + S <- survival::Surv(t, y) p <- predictionOfInterest$value - - out <- tryCatch({summary(survival::survfit(survival::Surv(t, y) ~ 1), times = timepoint)}, - error = function(e){ParallelLogger::logError(e); return(NULL)}) - survVal <- 1-out$surv + + out <- tryCatch( + { + summary(survival::survfit(survival::Surv(t, y) ~ 1), times = timepoint) + }, + error = function(e) { + ParallelLogger::logError(e) + return(NULL) + } + ) + survVal <- 1 - out$surv meanSurvivalTime <- mean(t) - + result <- rbind( - result, - c(evalType, timepoint, 'Survival', survVal), - c(evalType, timepoint, 'Mean survival time', meanSurvivalTime) + result, + c(evalType, timepoint, "Survival", survVal), + c(evalType, timepoint, "Mean survival time", meanSurvivalTime) ) - + # add c-stat - ParallelLogger::logTrace('Calculating C-statistic') - - conc <- tryCatch({survival::concordance(S~p, reverse=TRUE)}, - error = function(e){ParallelLogger::logError(e); return(NULL)}) + ParallelLogger::logTrace("Calculating C-statistic") + + conc <- tryCatch( + { + survival::concordance(S ~ p, reverse = TRUE) + }, + error = function(e) { + ParallelLogger::logError(e) + return(NULL) + } + ) cStatistic <- 0 cStatistic_l95CI <- 0 cStatistic_u95CI <- 0 - - if(!is.null(conc)){ - cStatistic <- round(conc$concordance,5) + + if (!is.null(conc)) { + cStatistic <- round(conc$concordance, 5) c.se <- sqrt(conc$var) - cStatistic_l95CI <- round(conc$concordance+stats::qnorm(.025)*c.se,3) - cStatistic_u95CI <- round(conc$concordance+stats::qnorm(.975)*c.se,3) + cStatistic_l95CI <- round(conc$concordance + stats::qnorm(.025) * c.se, 3) + cStatistic_u95CI <- round(conc$concordance + stats::qnorm(.975) * c.se, 3) } result <- rbind( - result, - c(evalType, timepoint, 'C-statistic', cStatistic), - c(evalType, timepoint, 'C-statistic lower 95% CI', cStatistic_l95CI), - c(evalType, timepoint, 'C-statistic upper 95% CI', cStatistic_u95CI) + result, + c(evalType, timepoint, "C-statistic", cStatistic), + c(evalType, timepoint, "C-statistic lower 95% CI", cStatistic_l95CI), + c(evalType, timepoint, "C-statistic upper 95% CI", cStatistic_u95CI) ) - ParallelLogger::logInfo(paste0('C-statistic: ',cStatistic, ' (',cStatistic_l95CI ,'-', cStatistic_u95CI ,')')) + ParallelLogger::logInfo(paste0("C-statistic: ", cStatistic, " (", cStatistic_l95CI, "-", cStatistic_u95CI, ")")) # add e-stat .validateSurvival <- function(p, S, timepoint) { estimatedSurvival <- 1 - p notMissing <- !is.na(estimatedSurvival + S[, 1] + S[, 2]) - estimatedSurvival <- estimatedSurvival[notMissing] + estimatedSurvival <- estimatedSurvival[notMissing] S <- S[notMissing, ] .curtail <- function(x) pmin(.9999, pmax(x, .0001)) f <- polspline::hare( @@ -250,35 +260,39 @@ getEvaluationStatistics_survival <- function(prediction, evalColumn, timepoint, } w <- tryCatch( - { - .validateSurvival( - p = p, - S = S, - timepoint = timepoint - ) - }, - error = function(e){ParallelLogger::logError(e); return(NULL)} + { + .validateSurvival( + p = p, + S = S, + timepoint = timepoint + ) + }, + error = function(e) { + ParallelLogger::logError(e) + return(NULL) + } ) eStatistic <- eStatistic90 <- -1 if (!is.null(w)) { - eStatistic <- mean(abs(w$actual - w$estimatedSurvival)) + eStatistic <- mean(abs(w$actual - w$estimatedSurvival)) eStatistic90 <- stats::quantile(abs(w$actual - w$estimatedSurvival), - probs = .9, na.rm = TRUE) + probs = .9, na.rm = TRUE + ) } result <- rbind( - result, - c(evalType, timepoint, 'E-statistic', eStatistic), - c(evalType, timepoint, 'E-statistic 90%', eStatistic90) + result, + c(evalType, timepoint, "E-statistic", eStatistic), + c(evalType, timepoint, "E-statistic 90%", eStatistic90) ) - ParallelLogger::logInfo(paste0('E-statistic: ',eStatistic)) - ParallelLogger::logInfo(paste0('E-statistic 90%: ',eStatistic90)) + ParallelLogger::logInfo(paste0("E-statistic: ", eStatistic)) + ParallelLogger::logInfo(paste0("E-statistic 90%: ", eStatistic90)) } - + result <- as.data.frame(result) - colnames(result) <- c('evaluation','timepoint','metric','value') - + colnames(result) <- c("evaluation", "timepoint", "metric", "value") + return(result) } @@ -286,7 +300,7 @@ getEvaluationStatistics_survival <- function(prediction, evalColumn, timepoint, calculateEStatisticsBinary <- function(prediction) { risk <- prediction$value outcome <- prediction$outcomeCount - notna <- ! is.na(risk + outcome) + notna <- !is.na(risk + outcome) risk <- risk[notna] outcome <- outcome[notna] smoothFit <- stats::lowess(risk, outcome, iter = 0) @@ -306,9 +320,9 @@ calculateEStatisticsBinary <- function(prediction) { } -#================================== +# ================================== # Fucntions for the summary -#================================== +# ================================== #' Compute the area under the ROC curve #' @@ -321,11 +335,13 @@ calculateEStatisticsBinary <- function(prediction) { #' @param confidenceInterval Should 95 percebt confidence intervals be computed? #' #' @export -computeAuc <- function(prediction, - confidenceInterval = FALSE) { - if (attr(prediction, "metaData")$modelType != "binary") +computeAuc <- function( + prediction, + confidenceInterval = FALSE) { + if (attr(prediction, "metaData")$modelType != "binary") { stop("Computing AUC is only implemented for binary classification models") - + } + if (confidenceInterval) { return(aucWithCi(prediction = prediction$value, truth = prediction$outcomeCount)) } else { @@ -333,14 +349,14 @@ computeAuc <- function(prediction, } } -aucWithCi <- function(prediction, truth){ - auc <- pROC::auc(as.factor(truth), prediction, direction="<", quiet=TRUE) - aucci <-pROC::ci(auc) +aucWithCi <- function(prediction, truth) { + auc <- pROC::auc(as.factor(truth), prediction, direction = "<", quiet = TRUE) + aucci <- pROC::ci(auc) return(data.frame(auc = aucci[2], auc_lb95ci = aucci[1], auc_ub95ci = aucci[3])) } -aucWithoutCi <- function(prediction, truth){ - auc <- pROC::auc(as.factor(truth), prediction, direction="<", quiet=TRUE) +aucWithoutCi <- function(prediction, truth) { + auc <- pROC::auc(as.factor(truth), prediction, direction = "<", quiet = TRUE) return(as.double(auc)) } @@ -356,12 +372,11 @@ aucWithoutCi <- function(prediction, truth){ #' A list containing the brier score and the scaled brier score #' #' @export -brierScore <- function(prediction){ - - brier <- sum((prediction$outcomeCount -prediction$value)^2)/nrow(prediction) - brierMax <- mean(prediction$value)*(1-mean(prediction$value)) - brierScaled <- 1-brier/brierMax - return(list(brier=brier,brierScaled=brierScaled)) +brierScore <- function(prediction) { + brier <- sum((prediction$outcomeCount - prediction$value)^2) / nrow(prediction) + brierMax <- mean(prediction$value) * (1 - mean(prediction$value)) + brierScaled <- 1 - brier / brierMax + return(list(brier = brier, brierScaled = brierScaled)) } #' calibrationLine @@ -373,60 +388,64 @@ brierScore <- function(prediction){ #' Calculates the calibration from prediction object #' #' @export -calibrationLine <- function(prediction,numberOfStrata=10){ +calibrationLine <- function(prediction, numberOfStrata = 10) { outPpl <- unique(prediction$rowId) - - q <- unique(stats::quantile(prediction$value, c((1:(numberOfStrata - 1))/numberOfStrata, 1))) - - if(length(unique(c(0,q)))==2){ - warning('Prediction not spread') - #res <- c(0,0) - #lmData <- NULL - #hosmerlemeshow <- c(0,0,0) + + q <- unique(stats::quantile(prediction$value, c((1:(numberOfStrata - 1)) / numberOfStrata, 1))) + + if (length(unique(c(0, q))) == 2) { + warning("Prediction not spread") + # res <- c(0,0) + # lmData <- NULL + # hosmerlemeshow <- c(0,0,0) prediction$strata <- cut(prediction$value, - breaks = c(-0.1,0.5,1), #,max(prediction$value)), - labels = FALSE) + breaks = c(-0.1, 0.5, 1), # ,max(prediction$value)), + labels = FALSE + ) } else { prediction$strata <- cut(prediction$value, - breaks = unique(c(-0.1,q)), #,max(prediction$value)), - labels = FALSE) + breaks = unique(c(-0.1, q)), # ,max(prediction$value)), + labels = FALSE + ) } - + # get observed events: - obs.Points <- stats::aggregate(prediction$outcomeCount, by=list(prediction$strata), FUN=mean) - colnames(obs.Points) <- c('group','obs') - pred.Points <- stats::aggregate(prediction$value, by=list(prediction$strata), FUN=mean) - colnames(pred.Points) <- c('group','pred') - + obs.Points <- stats::aggregate(prediction$outcomeCount, by = list(prediction$strata), FUN = mean) + colnames(obs.Points) <- c("group", "obs") + pred.Points <- stats::aggregate(prediction$value, by = list(prediction$strata), FUN = mean) + colnames(pred.Points) <- c("group", "pred") + # hosmer-lemeshow-goodness-of-fit-test - obs.count <- stats::aggregate(prediction$outcomeCount, by=list(prediction$strata), FUN=sum) - colnames(obs.count) <- c('group','observed') - expected.count <- stats::aggregate(prediction$value, by=list(prediction$strata), FUN=sum) - colnames(expected.count) <- c('group','expected') - hoslem <- merge(obs.count, expected.count, by='group') - obs.count2 <- stats::aggregate(1-prediction$outcomeCount, by=list(prediction$strata), FUN=sum) - colnames(obs.count2) <- c('group','observed') - expected.count2 <- stats::aggregate(1-prediction$value, by=list(prediction$strata), FUN=sum) - colnames(expected.count2) <- c('group','expected') - nhoslem <- merge(obs.count2, expected.count2, by='group') - Xsquared <- sum((hoslem$observed-hoslem$expected)^2/hoslem$expected) + - sum((nhoslem$observed-nhoslem$expected)^2/nhoslem$expected) - pvalue <- stats::pchisq(Xsquared, df=numberOfStrata-2, lower.tail = F) - hosmerlemeshow <- data.frame(Xsquared=Xsquared, df=numberOfStrata-2, pvalue=pvalue) - + obs.count <- stats::aggregate(prediction$outcomeCount, by = list(prediction$strata), FUN = sum) + colnames(obs.count) <- c("group", "observed") + expected.count <- stats::aggregate(prediction$value, by = list(prediction$strata), FUN = sum) + colnames(expected.count) <- c("group", "expected") + hoslem <- merge(obs.count, expected.count, by = "group") + obs.count2 <- stats::aggregate(1 - prediction$outcomeCount, by = list(prediction$strata), FUN = sum) + colnames(obs.count2) <- c("group", "observed") + expected.count2 <- stats::aggregate(1 - prediction$value, by = list(prediction$strata), FUN = sum) + colnames(expected.count2) <- c("group", "expected") + nhoslem <- merge(obs.count2, expected.count2, by = "group") + Xsquared <- sum((hoslem$observed - hoslem$expected)^2 / hoslem$expected) + + sum((nhoslem$observed - nhoslem$expected)^2 / nhoslem$expected) + pvalue <- stats::pchisq(Xsquared, df = numberOfStrata - 2, lower.tail = F) + hosmerlemeshow <- data.frame(Xsquared = Xsquared, df = numberOfStrata - 2, pvalue = pvalue) + # linear model fitting obs to pred: - lmData <- merge(obs.Points, pred.Points, by='group') - model <- stats::lm(obs ~pred, data=lmData) - - ##graphics::plot(lmData$pred, lmData$obs) - ##graphics::abline(a = model$coefficients[1], b = model$coefficients[2], col='red') + lmData <- merge(obs.Points, pred.Points, by = "group") + model <- stats::lm(obs ~ pred, data = lmData) + + ## graphics::plot(lmData$pred, lmData$obs) + ## graphics::abline(a = model$coefficients[1], b = model$coefficients[2], col='red') res <- model$coefficients - names(res) <- c('Intercept','Gradient') + names(res) <- c("Intercept", "Gradient") # - - result <- list(lm=res, + + result <- list( + lm = res, aggregateLmData = lmData, - hosmerlemeshow = hosmerlemeshow) + hosmerlemeshow = hosmerlemeshow + ) return(result) } @@ -441,13 +460,13 @@ calibrationLine <- function(prediction,numberOfStrata=10){ #' The average precision #' #' @export -averagePrecision <- function(prediction){ +averagePrecision <- function(prediction) { lab.order <- prediction$outcomeCount[order(-prediction$value)] n <- nrow(prediction) - P <- sum(prediction$outcomeCount>0) + P <- sum(prediction$outcomeCount > 0) val <- rep(0, n) - val[lab.order>0] <- 1:P - return(sum(val/(1:n))/P) + val[lab.order > 0] <- 1:P + return(sum(val / (1:n)) / P) } @@ -456,90 +475,106 @@ averagePrecision <- function(prediction){ #' @return data.frame with meanPredictionRisk, observedRisk, and N #' @keywords internal calibrationInLarge <- function(prediction) { - - result <- data.frame(meanPredictionRisk = mean(prediction$value), + result <- data.frame( + meanPredictionRisk = mean(prediction$value), observedRisk = sum(prediction$outcomeCount) / nrow(prediction), N = nrow(prediction) ) - + return(result) } -calibrationInLargeIntercept <- function(prediction){ - - #do invert of log function: +calibrationInLargeIntercept <- function(prediction) { + # do invert of log function: # log(p/(1-p)) - + # edit the 0 and 1 values - prediction$value[prediction$value==0] <- 0.000000000000001 - prediction$value[prediction$value==1] <- 1-0.000000000000001 - - inverseLog <- log(prediction$value/(1-prediction$value)) - y <- ifelse(prediction$outcomeCount>0, 1, 0) - - intercept <- suppressWarnings(stats::glm(y ~ offset(1*inverseLog), family = 'binomial')) + prediction$value[prediction$value == 0] <- 0.000000000000001 + prediction$value[prediction$value == 1] <- 1 - 0.000000000000001 + + inverseLog <- log(prediction$value / (1 - prediction$value)) + y <- ifelse(prediction$outcomeCount > 0, 1, 0) + + intercept <- suppressWarnings(stats::glm(y ~ offset(1 * inverseLog), family = "binomial")) intercept <- intercept$coefficients[1] - + return(intercept) } -calibrationWeak <- function(prediction){ - - #do invert of log function: +calibrationWeak <- function(prediction) { + # do invert of log function: # log(p/(1-p)) - + # edit the 0 and 1 values - prediction$value[prediction$value==0] <- 0.000000000000001 - prediction$value[prediction$value==1] <- 1-0.000000000000001 - - inverseLog <- log(prediction$value/(1-prediction$value)) - y <- ifelse(prediction$outcomeCount>0, 1, 0) - - #intercept <- suppressWarnings(stats::glm(y ~ offset(1*inverseLog), family = 'binomial')) - #intercept <- intercept$coefficients[1] - #gradient <- suppressWarnings(stats::glm(y ~ inverseLog+0, family = 'binomial', + prediction$value[prediction$value == 0] <- 0.000000000000001 + prediction$value[prediction$value == 1] <- 1 - 0.000000000000001 + + inverseLog <- log(prediction$value / (1 - prediction$value)) + y <- ifelse(prediction$outcomeCount > 0, 1, 0) + + # intercept <- suppressWarnings(stats::glm(y ~ offset(1*inverseLog), family = 'binomial')) + # intercept <- intercept$coefficients[1] + # gradient <- suppressWarnings(stats::glm(y ~ inverseLog+0, family = 'binomial', # offset = rep(intercept,length(inverseLog)))) - #gradient <- gradient$coefficients[1] - - vals <- suppressWarnings(stats::glm(y ~ inverseLog, family = 'binomial')) - - result <- data.frame(intercept = vals$coefficients[1], - gradient = vals$coefficients[2]) - + # gradient <- gradient$coefficients[1] + + vals <- suppressWarnings(stats::glm(y ~ inverseLog, family = "binomial")) + + result <- data.frame( + intercept = vals$coefficients[1], + gradient = vals$coefficients[2] + ) + return(result) } #' Calculate the Integrated Calibration Information from Austin and Steyerberg #' https://onlinelibrary.wiley.com/doi/full/10.1002/sim.8281 -#' +#' #' @details #' Calculate the Integrated Calibration Information #' #' @param prediction the prediction object found in the plpResult object -#' +#' #' @return #' Integrated Calibration Information #' #' @export -ici <- function(prediction){ - #remove na - if(sum(!is.finite(prediction$value))>0){ - prediction <- prediction[is.finite(prediction$value),] +ici <- function(prediction) { + # remove na + if (sum(!is.finite(prediction$value)) > 0) { + prediction <- prediction[is.finite(prediction$value), ] } - loess.calibrate <- tryCatch({stats::loess(prediction$outcomeCount ~ prediction$value)}, - warning = function(w){ParallelLogger::logInfo(w)}, - error = function(e){ParallelLogger::logInfo(e); return(NULL)} + loess.calibrate <- tryCatch( + { + stats::loess(prediction$outcomeCount ~ prediction$value) + }, + warning = function(w) { + ParallelLogger::logInfo(w) + }, + error = function(e) { + ParallelLogger::logInfo(e) + return(NULL) + } ) - if(!is.null(loess.calibrate)){ + if (!is.null(loess.calibrate)) { # Estimate loess-based smoothed calibration curve - P.calibrate <- tryCatch({stats::predict(loess.calibrate, newdata = prediction$value)}, - warning = function(w){ParallelLogger::logInfo(w)}, - error = function(e){ParallelLogger::logInfo(e); return(NULL)} + P.calibrate <- tryCatch( + { + stats::predict(loess.calibrate, newdata = prediction$value) + }, + warning = function(w) { + ParallelLogger::logInfo(w) + }, + error = function(e) { + ParallelLogger::logInfo(e) + return(NULL) + } ) - if(!is.null(P.calibrate)){ + if (!is.null(P.calibrate)) { # This is the point on the loess calibration curve corresponding to a given predicted probability. - ICI <- mean (abs(P.calibrate - prediction$value)) + ICI <- mean(abs(P.calibrate - prediction$value)) return(ICI) } } diff --git a/R/HelperFunctions.R b/R/HelperFunctions.R index be3aa989..3dc23ad4 100644 --- a/R/HelperFunctions.R +++ b/R/HelperFunctions.R @@ -1,20 +1,25 @@ -removeInvalidString <- function(string){ - modString <- gsub('_', ' ', string) - modString <- gsub('\\.', ' ', modString) +removeInvalidString <- function(string) { + modString <- gsub("_", " ", string) + modString <- gsub("\\.", " ", modString) modString <- gsub("[[:punct:]]", "", modString) - modString <- gsub(' ', '_', modString) + modString <- gsub(" ", "_", modString) return(modString) } +#' Check if the required packages for survival analysis are installed +checkSurvivalPackages <- function() { + rlang::check_installed("survival", "polspline", "survminer", + reason = "Please install the required packages for survival analysis" + ) +} #' Create a temporary model location -#' +#' #' @export -createTempModelLoc <- function(){ - repeat{ - ##loc <- paste(tempdir(), paste0('python_models_',sample(10002323,1)), sep = '\\') - loc <- file.path(tempdir(), paste0('python_models_',sample(10002323,1))) - if(!dir.exists(loc)){ +createTempModelLoc <- function() { + repeat { + loc <- file.path(tempdir(), paste0("python_models_", sample(10002323, 1))) + if (!dir.exists(loc)) { return(loc) } } @@ -24,19 +29,19 @@ createTempModelLoc <- function(){ #' #' @details #' This function joins two lists -#' @param a A list +#' @param a A list #' @param b Another list #' #' @export -listAppend <- function(a, b){ +listAppend <- function(a, b) { size <- length(a) + length(b) x <- list() length(x) <- size - for(i in 1:size){ - if(i<=length(a)){ + for (i in 1:size) { + if (i <= length(a)) { x[[i]] <- a[[i]] - } else{ - x[[i]] <- b[[i-length(a)]] + } else { + x[[i]] <- b[[i - length(a)]] } } names(x) <- c(names(a), names(b)) @@ -44,107 +49,111 @@ listAppend <- function(a, b){ } -#' Sets up a virtual environment to use for PLP (can be conda or python) +#' Sets up a virtual environment to use for PLP (can be conda or python) #' #' @details #' This function creates a virtual environment that can be used by PatientLevelPrediction #' and installs all the required package dependancies. If using python, pip must be set up. #' -#' @param envname A string for the name of the virtual environment (default is 'PLP') -#' @param envtype An option for specifying the environment as'conda' or 'python'. If NULL then the default is 'conda' for windows users and 'python' for non-windows users +#' @param envname A string for the name of the virtual environment (default is 'PLP') +#' @param envtype An option for specifying the environment as'conda' or 'python'. If NULL then the default is 'conda' for windows users and 'python' for non-windows users #' @param condaPythonVersion String, Python version to use when creating a conda environment #' #' @export -configurePython <- function(envname='PLP', envtype=NULL, condaPythonVersion="3.11"){ - - if(is.null(envtype)){ - if(getOs()=='windows'){ +configurePython <- function(envname = "PLP", envtype = NULL, condaPythonVersion = "3.11") { + if (is.null(envtype)) { + if (getOs() == "windows") { envtype <- "conda" } else { envtype <- "python" } } - - if(envtype=='conda'){ + + if (envtype == "conda") { pEnvironments <- reticulate::conda_list() - if(length(pEnvironments) > 0 && envname %in% pEnvironments$name){ - location <- '' - warning(paste0('Conda environment ', envname,' exists. You can use reticulate::conda_remove() to remove if you want to fresh config')) + if (length(pEnvironments) > 0 && envname %in% pEnvironments$name) { + location <- "" + warning(paste0("Conda environment ", envname, " exists. You can use reticulate::conda_remove() to remove if you want to fresh config")) } else { - ParallelLogger::logInfo(paste0('Creating virtual conda environment called ', envname)) - location <- reticulate::conda_create(envname=envname, packages = paste0("python==", condaPythonVersion), conda = "auto") + ParallelLogger::logInfo(paste0("Creating virtual conda environment called ", envname)) + location <- reticulate::conda_create(envname = envname, packages = paste0("python==", condaPythonVersion), conda = "auto") } - packages <- c('numpy','scipy','scikit-learn', 'pandas','pydotplus','joblib') - ParallelLogger::logInfo(paste0('Adding python dependancies to ', envname)) - reticulate::conda_install(envname=envname, packages = packages, forge = TRUE, pip = FALSE, - pip_ignore_installed = TRUE, conda = "auto") + packages <- c("numpy", "scipy", "scikit-learn", "pandas", "pydotplus", "joblib") + ParallelLogger::logInfo(paste0("Adding python dependancies to ", envname)) + reticulate::conda_install( + envname = envname, packages = packages, forge = TRUE, pip = FALSE, + pip_ignore_installed = TRUE, conda = "auto" + ) } else { pEnvironments <- reticulate::virtualenv_list() - if(length(pEnvironments) > 0 && envname %in% pEnvironments){ - location <- '' - warning(paste0('Python environment ', envname,' exists.')) + if (length(pEnvironments) > 0 && envname %in% pEnvironments) { + location <- "" + warning(paste0("Python environment ", envname, " exists.")) } else { - ParallelLogger::logInfo(paste0('Creating virtual python environment called ', envname)) - location <- reticulate::virtualenv_create(envname=envname) + ParallelLogger::logInfo(paste0("Creating virtual python environment called ", envname)) + location <- reticulate::virtualenv_create(envname = envname) } - packages <- c('numpy', 'scikit-learn','scipy', 'pandas','pydotplus') - ParallelLogger::logInfo(paste0('Adding python dependancies to ', envname)) - reticulate::virtualenv_install(envname=envname, packages = packages, - ignore_installed = TRUE) + packages <- c("numpy", "scikit-learn", "scipy", "pandas", "pydotplus") + ParallelLogger::logInfo(paste0("Adding python dependancies to ", envname)) + reticulate::virtualenv_install( + envname = envname, packages = packages, + ignore_installed = TRUE + ) } - + return(invisible(location)) } #' Use the virtual environment created using configurePython() #' #' @details -#' This function sets PatientLevelPrediction to use a virtual environment +#' This function sets PatientLevelPrediction to use a virtual environment #' -#' @param envname A string for the name of the virtual environment (default is 'PLP') -#' @param envtype An option for specifying the environment as'conda' or 'python'. If NULL then the default is 'conda' for windows users and 'python' for non-windows users +#' @param envname A string for the name of the virtual environment (default is 'PLP') +#' @param envtype An option for specifying the environment as'conda' or 'python'. If NULL then the default is 'conda' for windows users and 'python' for non-windows users #' #' @export -setPythonEnvironment <- function(envname='PLP', envtype=NULL){ - - if(is.null(envtype)){ - if(getOs()=='windows'){ - envtype=='conda' +setPythonEnvironment <- function(envname = "PLP", envtype = NULL) { + if (is.null(envtype)) { + if (getOs() == "windows") { + envtype == "conda" } else { - envtype=='python' + envtype == "python" } } - - if(envtype=='conda'){ + + if (envtype == "conda") { pEnvironments <- reticulate::conda_list() - if(!envname%in%pEnvironments$name){ - return(paste0('Conda environment ', envname,' not found. Please set up using configurePython()')) + if (!envname %in% pEnvironments$name) { + return(paste0("Conda environment ", envname, " not found. Please set up using configurePython()")) } reticulate::use_condaenv(envname) - return(paste0('Using conda environment ',envname)) + return(paste0("Using conda environment ", envname)) } else { pEnvironments <- reticulate::virtualenv_list() - if(!envname%in%pEnvironments$name){ - return(paste0('Python environment ', envname,' not found. Please set up using configurePython()')) + if (!envname %in% pEnvironments$name) { + return(paste0("Python environment ", envname, " not found. Please set up using configurePython()")) } reticulate::use_virtualenv(envname) - return(paste0('Using python environment ',envname)) + return(paste0("Using python environment ", envname)) } - } -getOs <- function(){ +getOs <- function() { sysinf <- Sys.info() - if (!is.null(sysinf)){ - os <- sysinf['sysname'] - if (os == 'Darwin') + if (!is.null(sysinf)) { + os <- sysinf["sysname"] + if (os == "Darwin") { os <- "osx" + } } else { ## mystery machine os <- .Platform$OS.type - if (grepl("^darwin", R.version$os)) + if (grepl("^darwin", R.version$os)) { os <- "osx" - if (grepl("linux-gnu", R.version$os)) + } + if (grepl("linux-gnu", R.version$os)) { os <- "linux" + } } tolower(os) } @@ -152,96 +161,112 @@ getOs <- function(){ # Borrowed and adapted from Hmisc: https://github.com/harrelfe/Hmisc/blob/39011dae3af3c943e67401ed6000644014707e8b/R/cut2.s cut2 <- function(x, g, m = 150, digits = 3) { - method <- 1 ## 20may02 x.unique <- sort(unique(c(x[!is.na(x)]))) - min.dif <- min(diff(x.unique))/2 + min.dif <- min(diff(x.unique)) / 2 min.dif.factor <- 1 - oldopt <- options('digits') - options(digits=digits) + oldopt <- options("digits") + options(digits = digits) on.exit(options(oldopt)) - xlab <- attr(x, 'label') + xlab <- attr(x, "label") nnm <- sum(!is.na(x)) - if(missing(g)) g <- max(1,floor(nnm/m)) - if(g < 1) - stop('g must be >=1, m must be positive') + if (missing(g)) g <- max(1, floor(nnm / m)) + if (g < 1) { + stop("g must be >=1, m must be positive") + } - options(digits=15) + options(digits = 15) n <- table(x) xx <- as.double(names(n)) options(digits = digits) cum <- cumsum(n) m <- length(xx) - y <- as.integer(ifelse(is.na(x),NA,1)) + y <- as.integer(ifelse(is.na(x), NA, 1)) labs <- character(g) - cuts <- stats::approx(cum, xx, xout=(1:g)*nnm/g, - method='constant', rule=2, f=1)$y + cuts <- stats::approx(cum, xx, + xout = (1:g) * nnm / g, + method = "constant", rule = 2, f = 1 + )$y cuts[length(cuts)] <- max(xx) lower <- xx[1] upper <- 1e45 up <- low <- double(g) i <- 0 - for(j in 1:g) { - cj <- if(method==1 || j==1) cuts[j] else { - if(i==0) - stop('program logic error') + for (j in 1:g) { + cj <- if (method == 1 || j == 1) { + cuts[j] + } else { + if (i == 0) { + stop("program logic error") + } # Not used unique values found in table(x) - s <- if(is.na(lower)) FALSE else xx >= lower - cum.used <- if(all(s)) 0 else max(cum[!s]) - if(j==m) max(xx) else if(sum(s)<2) max(xx) else - stats::approx(cum[s]-cum.used, xx[s], xout=(nnm-cum.used)/(g-j+1), - method='constant', rule=2, f=1)$y + s <- if (is.na(lower)) FALSE else xx >= lower + cum.used <- if (all(s)) 0 else max(cum[!s]) + if (j == m) { + max(xx) + } else if (sum(s) < 2) { + max(xx) + } else { + stats::approx(cum[s] - cum.used, xx[s], + xout = (nnm - cum.used) / (g - j + 1), + method = "constant", rule = 2, f = 1 + )$y + } } - - if(cj==upper) next - + + if (cj == upper) next + i <- i + 1 upper <- cj # assign elements to group i # y contains the group number in the end - y[x >= (lower-min.dif.factor*min.dif)] <- i + y[x >= (lower - min.dif.factor * min.dif)] <- i low[i] <- lower - lower <- if(j==g) upper else min(xx[xx > upper]) - - if(is.na(lower)) lower <- upper - - up[i] <- lower + lower <- if (j == g) upper else min(xx[xx > upper]) + + if (is.na(lower)) lower <- upper + + up[i] <- lower } - - low <- low[1:i] - up <- up[1:i] + + low <- low[1:i] + up <- up[1:i] # Are the bounds different? variation <- logical(i) - for(ii in 1:i) { - r <- range(x[y==ii], na.rm=TRUE) + for (ii in 1:i) { + r <- range(x[y == ii], na.rm = TRUE) variation[ii] <- diff(r) > 0 } - flow <- do.call(format,c(list(low), digits = 3)) - fup <- do.call(format,c(list(up), digits = 3)) - bb <- c(rep(')',i-1),']') - labs <- ifelse(low==up | (!variation), flow, - paste('[',flow,',',fup,bb,sep='')) - ss <- y==0 & !is.na(y) - if(any(ss)) - stop(paste('categorization error in cut2. Values of x not appearing in any interval:\n', - paste(format(x[ss],digits=12),collapse=' '), - '\nLower endpoints:', - paste(format(low,digits=12), collapse=' '), - '\nUpper endpoints:', - paste(format(up,digits=12),collapse=' '))) - - y <- structure(y, class='factor', levels=labs) - - attr(y,'class') <- "factor" - if(length(xlab)){ - #label(y) <- xlab # what is label? + flow <- do.call(format, c(list(low), digits = 3)) + fup <- do.call(format, c(list(up), digits = 3)) + bb <- c(rep(")", i - 1), "]") + labs <- ifelse(low == up | (!variation), flow, + paste("[", flow, ",", fup, bb, sep = "") + ) + ss <- y == 0 & !is.na(y) + if (any(ss)) { + stop(paste( + "categorization error in cut2. Values of x not appearing in any interval:\n", + paste(format(x[ss], digits = 12), collapse = " "), + "\nLower endpoints:", + paste(format(low, digits = 12), collapse = " "), + "\nUpper endpoints:", + paste(format(up, digits = 12), collapse = " ") + )) + } + + y <- structure(y, class = "factor", levels = labs) + + attr(y, "class") <- "factor" + if (length(xlab)) { + # label(y) <- xlab # what is label? # think the below does the same as the line above - class(y) <- 'labelled' - attr(y, 'label') <- xlab + class(y) <- "labelled" + attr(y, "label") <- xlab } return(y)