diff --git a/R/ParamChecks.R b/R/ParamChecks.R index 7b8ef29e..be424c42 100644 --- a/R/ParamChecks.R +++ b/R/ParamChecks.R @@ -99,3 +99,12 @@ checkColumnNames <- function(parameter, columnNames) { } return(TRUE) } + +checkIsEqual <- function(parameter, value) { + name <- deparse(substitute(parameter)) + if (!identical(parameter, value)) { + ParallelLogger::logError(paste0(name, " should be equal to ", value)) + stop(paste0(name, " is not equal to ", value)) + } + return(TRUE) +} diff --git a/R/SklearnClassifier.R b/R/SklearnClassifier.R index 850e7b13..df45f866 100644 --- a/R/SklearnClassifier.R +++ b/R/SklearnClassifier.R @@ -22,43 +22,36 @@ fitSklearn <- function(trainData, analysisId, ...) { param <- modelSettings$param - + # check covariate data if (!FeatureExtraction::isCovariateData(trainData$covariateData)) { stop("Needs correct covariateData") } - + # get the settings from the param - pySettings <- attr(param, 'settings') - + pySettings <- attr(param, "settings") + # make sure the inputs are valid checkPySettings(pySettings) - + start <- Sys.time() - + if (!is.null(trainData$folds)) { trainData$labels <- - merge(trainData$labels, trainData$fold, by = 'rowId') + merge(trainData$labels, trainData$fold, by = "rowId") } - + # convert the data to a sparse R Matrix and then use reticulate to convert to python sparse # need to save the covariateMap so we know the covariateId to columnId when applying model mappedData <- toSparseM(trainData) - + matrixData <- mappedData$dataMatrix labels <- mappedData$labels covariateRef <- mappedData$covariateRef - + # save the model to outLoc outLoc <- createTempModelLoc() - - # functions does CV and fits final models - # returns: prediction (Train/CV), - # finalParam (optimal hyper-parameters) - # variableImportance (final model) - # paramGridSearch list with performance and params for complete grid search - # at the moment it uses AUC as performance but this could be modified to let user - # specify the performance metric to optimise + cvResult <- do.call( what = gridCvPython, args = list( @@ -71,38 +64,38 @@ fitSklearn <- function(trainData, pythonClass = pySettings$pythonClass, modelLocation = outLoc, paramSearch = param, - saveToJson = attr(param, 'saveToJson') + saveToJson = attr(param, "saveToJson") ) ) - + hyperSummary <- - do.call(rbind, - lapply(cvResult$paramGridSearch, function(x) - x$hyperSummary)) - + do.call( + rbind, + lapply(cvResult$paramGridSearch, function(x) { + x$hyperSummary + }) + ) + prediction <- cvResult$prediction - + variableImportance <- cvResult$variableImportance variableImportance[is.na(variableImportance)] <- 0 - + incs <- rep(1, nrow(covariateRef)) covariateRef$included <- incs covariateRef$covariateValue <- unlist(variableImportance) # check this is correct order - + comp <- start - Sys.time() - + result <- list( model = file.path(outLoc), - preprocessing = list( featureEngineering = attr(trainData, "metaData")$featureEngineering, tidyCovariates = attr(trainData$covariateData, "metaData")$tidyCovariateDataSettings, - requireDenseMatrix = attr(param, 'settings')$requiresDenseMatrix + requireDenseMatrix = attr(param, "settings")$requiresDenseMatrix ), - prediction = prediction, - modelDesign = PatientLevelPrediction::createModelDesign( targetId = attr(trainData, "metaData")$targetId, outcomeId = attr(trainData, "metaData")$outcomeId, @@ -115,32 +108,30 @@ fitSklearn <- function(trainData, splitSettings = attr(trainData, "metaData")$splitSettings, sampleSettings = attr(trainData, "metaData")$sampleSettings ), - 
trainDetails = list( analysisId = analysisId, - analysisSource = '', - #TODO add from model + analysisSource = "", + # TODO add from model developmentDatabase = attr(trainData, "metaData")$cdmDatabaseName, developmentDatabaseSchema = attr(trainData, "metaData")$cdmDatabaseSchema, attrition = attr(trainData, "metaData")$attrition, - trainingTime = paste(as.character(abs(comp)), attr(comp, 'units')), + trainingTime = paste(as.character(abs(comp)), attr(comp, "units")), trainingDate = Sys.Date(), modelName = pySettings$name, finalModelParameters = cvResult$finalParam, hyperParamSearch = hyperSummary ), - covariateImportance = covariateRef ) - + class(result) <- "plpModel" attr(result, "predictionFunction") <- "predictPythonSklearn" attr(result, "modelType") <- "binary" attr(result, "saveType") <- - attr(param, 'saveType') # in save/load plp + attr(param, "saveType") # in save/load plp attr(result, "saveToJson") <- - attr(param, 'saveToJson') # when saving in reticulate - + attr(param, "saveToJson") # when saving in reticulate + return(result) } @@ -148,7 +139,7 @@ fitSklearn <- function(trainData, predictPythonSklearn <- function(plpModel, data, cohort) { - if (inherits(data, 'plpData')) { + if (inherits(data, "plpData")) { # convert matrixObjects <- toSparseM( plpData = data, @@ -156,74 +147,73 @@ predictPythonSklearn <- function(plpModel, map = plpModel$covariateImportance %>% dplyr::select("columnId", "covariateId") ) - + newData <- matrixObjects$dataMatrix cohort <- matrixObjects$labels - - } else{ + } else { newData <- data } - + # load model - if (attr(plpModel, 'saveToJson')) { + if (attr(plpModel, "saveToJson")) { modelLocation <- reticulate::r_to_py(file.path(plpModel$model, "model.json")) model <- sklearnFromJson(path = modelLocation) - } else{ - os <- reticulate::import('os') - joblib <- reticulate::import('joblib', convert = FALSE) + } else { + os <- reticulate::import("os") + joblib <- reticulate::import("joblib", convert = FALSE) modelLocation <- reticulate::r_to_py(file.path(plpModel$model, "model.pkl")) model <- joblib$load(os$path$join(modelLocation)) } included <- plpModel$covariateImportance$columnId[plpModel$covariateImportance$included > - 0] # does this include map? - pythonData <- reticulate::r_to_py(newData[, included, drop = F]) - + 0] # does this include map? 
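+  # keep only the columns (covariates) retained by the final model; columnId indexes the columns of the sparse matrix built above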
+ pythonData <- reticulate::r_to_py(newData[, included, drop = FALSE]) + # make dense if needed if (plpModel$preprocessing$requireDenseMatrix) { pythonData <- pythonData$toarray() } - + cohort <- predictValues( model = model, data = pythonData, cohort = cohort, - type = attr(plpModel, 'modelType') + type = attr(plpModel, "modelType") ) - + return(cohort) } -predictValues <- function(model, data, cohort, type = 'binary') { - predictionValue <- model$predict_proba(data) +predictValues <- function(model, data, cohort, type = "binary") { + predictionValue <- model$predict_proba(data) cohort$value <- reticulate::py_to_r(predictionValue)[, 2] - + cohort <- cohort %>% dplyr::select(-"rowId") %>% dplyr::rename(rowId = "originalRowId") - - attr(cohort, "metaData")$modelType <- type - + + attr(cohort, "metaData")$modelType <- type + return(cohort) } checkPySettings <- function(settings) { - checkIsClass(settings$seed, c('numeric', 'integer')) - ParallelLogger::logDebug(paste0('classifier seed: ', settings$seed)) - - checkIsClass(settings$requiresDenseMatrix, c('logical')) - ParallelLogger::logDebug(paste0('requiresDenseMatrix: ', settings$requiresDenseMatrix)) - - checkIsClass(settings$name, c('character')) - ParallelLogger::logDebug(paste0('name: ', settings$name)) - - checkIsClass(settings$pythonModule, c('character')) - ParallelLogger::logDebug(paste0('pythonModule: ', settings$pythonModule)) - - checkIsClass(settings$pythonClass, c('character')) - ParallelLogger::logDebug(paste0('pythonClass: ', settings$pythonClass)) + checkIsClass(settings$seed, c("numeric", "integer")) + ParallelLogger::logDebug(paste0("classifier seed: ", settings$seed)) + + checkIsClass(settings$requiresDenseMatrix, c("logical")) + ParallelLogger::logDebug(paste0("requiresDenseMatrix: ", settings$requiresDenseMatrix)) + + checkIsClass(settings$name, c("character")) + ParallelLogger::logDebug(paste0("name: ", settings$name)) + + checkIsClass(settings$pythonModule, c("character")) + ParallelLogger::logDebug(paste0("pythonModule: ", settings$pythonModule)) + + checkIsClass(settings$pythonClass, c("character")) + ParallelLogger::logDebug(paste0("pythonClass: ", settings$pythonClass)) } gridCvPython <- function(matrixData, @@ -235,53 +225,50 @@ gridCvPython <- function(matrixData, pythonClass, modelLocation, paramSearch, - saveToJson) -{ + saveToJson) { ParallelLogger::logInfo(paste0("Running CV for ", modelName, " model")) - - np <- reticulate::import('numpy') - os <- reticulate::import('os') - sys <- reticulate::import('sys') - math <- reticulate::import('math') - scipy <- reticulate::import('scipy') - joblib <- reticulate::import('joblib') - + + np <- reticulate::import("numpy") + joblib <- reticulate::import("joblib") + module <- reticulate::import(pythonModule, convert = FALSE) classifier <- module[pythonClass] - + ########################################################################### - + gridSearchPredictons <- list() length(gridSearchPredictons) <- length(paramSearch) - + for (gridId in 1:length(paramSearch)) { # initiate prediction prediction <- c() - + fold <- labels$index - ParallelLogger::logInfo(paste0('Max fold: ', max(fold))) - + ParallelLogger::logInfo(paste0("Max fold: ", max(fold))) + for (i in 1:max(fold)) { - ParallelLogger::logInfo(paste0('Fold ', i)) + ParallelLogger::logInfo(paste0("Fold ", i)) trainY <- reticulate::r_to_py(labels$outcomeCount[fold != i]) trainX <- reticulate::r_to_py(matrixData[fold != i, ]) testX <- reticulate::r_to_py(matrixData[fold == i, ]) - + if (requiresDenseMatrix) { - 
ParallelLogger::logInfo('Converting sparse martix to dense matrix (CV)') + ParallelLogger::logInfo("Converting sparse matrix to dense matrix (CV)") trainX <- trainX$toarray() testX <- testX$toarray() } - + model <- - fitPythonModel(classifier, - paramSearch[[gridId]], - seed, - trainX, - trainY, - np, - pythonClass) - + fitPythonModel( + classifier, + paramSearch[[gridId]], + seed, + trainX, + trainY, + np, + pythonClass + ) + ParallelLogger::logInfo("Calculating predictions on left out fold set...") prediction <- rbind( @@ -290,84 +277,95 @@ gridCvPython <- function(matrixData, model = model, data = testX, cohort = labels[fold == i, ], - type = 'binary' + type = "binary" ) ) - } - - gridSearchPredictons[[gridId]] <- list(prediction = prediction, - param = paramSearch[[gridId]]) + + gridSearchPredictons[[gridId]] <- list( + prediction = prediction, + param = paramSearch[[gridId]] + ) } - + # get best para (this could be modified to enable any metric instead of AUC, just need metric input in function) - + paramGridSearch <- lapply(gridSearchPredictons, function(x) { do.call(computeGridPerformance, x) - }) # cvAUCmean, cvAUC, param - + }) # cvAUCmean, cvAUC, param + optimalParamInd <- - which.max(unlist(lapply(paramGridSearch, function(x) - x$cvPerformance))) - + which.max(unlist(lapply(paramGridSearch, function(x) { + x$cvPerformance + }))) + finalParam <- paramGridSearch[[optimalParamInd]]$param - + cvPrediction <- gridSearchPredictons[[optimalParamInd]]$prediction - cvPrediction$evaluationType <- 'CV' - - ParallelLogger::logInfo('Training final model using optimal parameters') - + cvPrediction$evaluationType <- "CV" + + ParallelLogger::logInfo("Training final model using optimal parameters") + trainY <- reticulate::r_to_py(labels$outcomeCount) trainX <- reticulate::r_to_py(matrixData) - + if (requiresDenseMatrix) { - ParallelLogger::logInfo('Converting sparse martix to dense matrix (final model)') + ParallelLogger::logInfo("Converting sparse matrix to dense matrix (final model)") trainX <- trainX$toarray() } - + model <- - fitPythonModel(classifier, - finalParam , - seed, - trainX, - trainY, - np, - pythonClass) - + fitPythonModel( + classifier, + finalParam, + seed, + trainX, + trainY, + np, + pythonClass + ) + ParallelLogger::logInfo("Calculating predictions on all train data...") prediction <- predictValues( model = model, data = trainX, cohort = labels, - type = 'binary' + type = "binary" ) - prediction$evaluationType <- 'Train' - - prediction <- rbind(prediction, - cvPrediction) - + prediction$evaluationType <- "Train" + + prediction <- rbind( + prediction, + cvPrediction + ) + # saving model if (!dir.exists(file.path(modelLocation))) { dir.create(file.path(modelLocation), recursive = T) } if (saveToJson) { - sklearnToJson(model = model, - path = file.path(modelLocation, "model.json")) - } else{ + sklearnToJson( + model = model, + path = file.path(modelLocation, "model.json") + ) + } else { joblib$dump(model, file.path(modelLocation, "model.pkl"), compress = T) } - + # feature importance variableImportance <- - tryCatch({ - reticulate::py_to_r(model$feature_importances_) - }, error = function(e) { - ParallelLogger::logInfo(e) - return(rep(1, ncol(matrixData))) - }) - + tryCatch( + { + reticulate::py_to_r(model$feature_importances_) + }, + error = function(e) { + ParallelLogger::logInfo(e) + return(rep(1, ncol(matrixData))) + } + ) + return( list( prediction = prediction, @@ -376,7 +374,6 @@ gridCvPython <- function(matrixData, variableImportance = variableImportance ) ) - } @@ 
-388,47 +385,49 @@ fitPythonModel <- trainY, np, pythonClass) { - ParallelLogger::logInfo(paste0('data X dim: ', trainX$shape[0], 'x', trainX$shape[1])) + ParallelLogger::logInfo(paste0("data X dim: ", trainX$shape[0], "x", trainX$shape[1])) ParallelLogger::logInfo(paste0( - 'data Y length: ', + "data Y length: ", np$shape(trainY)[[1]], - ' with ', + " with ", np$sum(trainY), - ' outcomes' + " outcomes" )) - + timeStart <- Sys.time() - + # print parameters # convert NULL to NA values paramString <- param for (ind in 1:length(paramString)) { if (is.null(paramString[[ind]])) { - paramString[[ind]] <- 'null' + paramString[[ind]] <- "null" } } ParallelLogger::logInfo(paste( names(param), unlist(paramString), - sep = ':', - collapse = ' ' + sep = ":", + collapse = " " )) - + if (!is.null(param)) { model <- - do.call(paste0(pythonClass, 'Inputs'), - list(classifier = classifier, param = param)) - } else{ + do.call( + paste0(pythonClass, "Inputs"), + list(classifier = classifier, param = param) + ) + } else { model <- classifier() } model <- model$fit(trainX, trainY) timeEnd <- Sys.time() - + ParallelLogger::logInfo(paste0( "Training model took (mins): ", - difftime(timeEnd, timeStart, units = 'mins') + difftime(timeEnd, timeStart, units = "mins") )) - + return(model) } @@ -443,36 +442,40 @@ fitPythonModel <- #' @return A list with overview of the performance #' @export computeGridPerformance <- - function(prediction, param, performanceFunct = 'computeAuc') { - performance <- do.call(what = eval(parse(text = performanceFunct)), - args = list(prediction = prediction)) - + function(prediction, param, performanceFunct = "computeAuc") { + performance <- do.call( + what = eval(parse(text = performanceFunct)), + args = list(prediction = prediction) + ) + performanceFold <- c() for (i in 1:max(prediction$index)) { - performanceFold <- c(performanceFold, - do.call( - what = eval(parse(text = performanceFunct)), - args = list(prediction = prediction[prediction$index == i, ]) - )) + performanceFold <- c( + performanceFold, + do.call( + what = eval(parse(text = performanceFunct)), + args = list(prediction = prediction[prediction$index == i, ]) + ) + ) } - + paramString <- param for (ind in 1:length(paramString)) { if (is.null(paramString[[ind]])) { - paramString[[ind]] <- 'null' + paramString[[ind]] <- "null" } } - - #hyperSummary <- c(performanceFunct, performance, performanceFold, unlist(paramString)) - #names(hyperSummary) <- c( + + # hyperSummary <- c(performanceFunct, performance, performanceFold, unlist(paramString)) + # names(hyperSummary) <- c( # 'Metric', # 'cvPerformance', # paste0('cvPerformanceFold',1:length(performanceFold)), # names(param) - #) + # ) paramValues <- unlist(paramString) names(paramValues) <- names(param) - + hyperSummary <- as.data.frame(c( data.frame( metric = performanceFunct, diff --git a/R/SklearnClassifierHelpers.R b/R/SklearnClassifierHelpers.R index 0535d833..b844242b 100644 --- a/R/SklearnClassifierHelpers.R +++ b/R/SklearnClassifierHelpers.R @@ -29,4 +29,4 @@ listCartesian <- function(allList) { function(i) lapply(combinations, function(x) x[i][[1]])) return(results) } - + diff --git a/R/SklearnClassifierSettings.R b/R/SklearnClassifierSettings.R index 1b6446b7..8f29ba3c 100644 --- a/R/SklearnClassifierSettings.R +++ b/R/SklearnClassifierSettings.R @@ -20,90 +20,96 @@ #' @param nEstimators (list) The maximum number of estimators at which boosting is terminated. In case of perfect fit, the learning procedure is stopped early. 
#' @param learningRate (list) Weight applied to each classifier at each boosting iteration. A higher learning rate increases the contribution of each classifier. There is a trade-off between the learningRate and nEstimators parameters #' There is a trade-off between learningRate and nEstimators. -#' @param algorithm (list) If ‘SAMME.R’ then use the SAMME.R real boosting algorithm. base_estimator must support calculation of class probabilities. If ‘SAMME’ then use the SAMME discrete boosting algorithm. The SAMME.R algorithm typically converges faster than SAMME, achieving a lower test error with fewer boosting iterations. +#' @param algorithm Only 'SAMME' can be provided. The 'algorithm' argument is deprecated and will be removed in scikit-learn 1.8. #' @param seed A seed for the model #' #' @examples #' \dontrun{ -#' model.adaBoost <- setAdaBoost(nEstimators = list(10,50,200), learningRate = list(1, 0.5, 0.1), -#' algorithm = list('SAMME.R'), seed = sample(1000000,1) -#' ) +#' model.adaBoost <- setAdaBoost( +#' nEstimators = list(10, 50, 200), learningRate = list(1, 0.5, 0.1), +#' algorithm = list("SAMME"), seed = sample(1000000, 1) +#' ) #' } #' @export setAdaBoost <- function(nEstimators = list(10, 50, 200), learningRate = list(1, 0.5, 0.1), - algorithm = list('SAMME.R'), + algorithm = list("SAMME"), seed = sample(1000000, 1)) { checkIsClass(seed[[1]], c("numeric", "integer")) - checkIsClass(nEstimators, 'list') - checkIsClass(learningRate, 'list') - checkIsClass(algorithm, 'list') - - lapply(1:length(nEstimators), function(i) - checkIsClass(nEstimators[[i]] , c("integer", "numeric"))) - lapply(1:length(nEstimators), function(i) - checkHigher(nEstimators[[i]] , 0)) - + checkIsClass(nEstimators, "list") + checkIsClass(learningRate, "list") + checkIsClass(algorithm, "list") + + lapply(1:length(nEstimators), function(i) { + checkIsClass(nEstimators[[i]], c("integer", "numeric")) + }) + lapply(1:length(nEstimators), function(i) { + checkHigher(nEstimators[[i]], 0) + }) + for (i in 1:length(nEstimators)) { if (inherits(x = nEstimators[[i]], what = c("numeric", "integer"))) { nEstimators[[i]] <- as.integer(nEstimators[[i]]) } } - - lapply(1:length(learningRate), function(i) - checkIsClass(learningRate[[i]] , c("numeric"))) - lapply(1:length(learningRate), function(i) - checkHigher(learningRate[[i]] , 0)) - - lapply(1:length(algorithm), function(i) - checkIsClass(algorithm[[i]] , c("character"))) - - # test python is available and the required dependancies are there: - ##checkPython() - + + lapply(1:length(learningRate), function(i) { + checkIsClass(learningRate[[i]], c("numeric")) + }) + lapply(1:length(learningRate), function(i) { + checkHigher(learningRate[[i]], 0) + }) + + lapply(1:length(algorithm), function(i) { + checkIsClass(algorithm[[i]], c("character")) + checkIsEqual(algorithm[[i]], "SAMME") + }) + paramGrid <- list( nEstimators = nEstimators, learningRate = learningRate, algorithm = algorithm, seed = list(as.integer(seed[[1]])) ) - + param <- listCartesian(paramGrid) - - attr(param, 'settings') <- list( - modelType = 'adaBoost', + + attr(param, "settings") <- list( + modelType = "adaBoost", seed = seed[[1]], paramNames = names(paramGrid), - #use this for logging params - requiresDenseMatrix = F, + # use this for logging params + requiresDenseMatrix = FALSE, name = "AdaBoost", pythonModule = "sklearn.ensemble", pythonClass = "AdaBoostClassifier" ) - - attr(param, 'saveToJson') <- T - attr(param, 'saveType') <- 'file' - - result <- list(fitFunction = "fitSklearn", - param = param) + + attr(param, 
"saveToJson") <- TRUE + attr(param, "saveType") <- "file" + + result <- list( + fitFunction = "fitSklearn", + param = param + ) class(result) <- "modelSettings" - + return(result) } AdaBoostClassifierInputs <- function(classifier, param) { model <- classifier( - n_estimators = param[[which.max(names(param) == 'nEstimators')]], - learning_rate = param[[which.max(names(param) == 'learningRate')]], - algorithm = param[[which.max(names(param) == 'algorithm')]], - random_state = param[[which.max(names(param) == 'seed')]] + n_estimators = param[[which.max(names(param) == "nEstimators")]], + learning_rate = param[[which.max(names(param) == "learningRate")]], + algorithm = param[[which.max(names(param) == "algorithm")]], + random_state = param[[which.max(names(param) == "seed")]] ) - + return(model) } -#' Create setting for the scikit-learn 1.0.1 DecisionTree with python +#' Create setting for the scikit-learn DecisionTree with python #' @param criterion The function to measure the quality of a split. Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain. #' @param splitter The strategy used to choose the split at each node. Supported strategies are “best” to choose the best split and “random” to choose the best random split. #' @param maxDepth (list) The maximum depth of the tree. If NULL, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples. @@ -118,144 +124,174 @@ AdaBoostClassifierInputs <- function(classifier, param) { #' #' @examples #' \dontrun{ -#' model.decisionTree <- setDecisionTree(maxDepth=10,minSamplesLeaf=10, seed=NULL ) +#' model.decisionTree <- setDecisionTree(maxDepth = 10, minSamplesLeaf = 10, seed = NULL) #' } #' @export -setDecisionTree <- function(criterion = list('gini'), - splitter = list('best'), +setDecisionTree <- function(criterion = list("gini"), + splitter = list("best"), maxDepth = list(as.integer(4), as.integer(10), NULL), minSamplesSplit = list(2, 10), minSamplesLeaf = list(10, 50), minWeightFractionLeaf = list(0), - maxFeatures = list(100, 'sqrt', NULL), + maxFeatures = list(100, "sqrt", NULL), maxLeafNodes = list(NULL), - minImpurityDecrease = list(10 ^ -7), + minImpurityDecrease = list(10^-7), classWeight = list(NULL), seed = sample(1000000, 1)) { - if (!inherits(x = seed[[1]], what = c('numeric', 'integer'))) { - stop('Invalid seed') + if (!inherits(x = seed[[1]], what = c("numeric", "integer"))) { + stop("Invalid seed") } - - checkIsClass(criterion, 'list') - checkIsClass(splitter, 'list') - checkIsClass(maxDepth, 'list') - checkIsClass(minSamplesSplit, 'list') - checkIsClass(minSamplesLeaf, 'list') - checkIsClass(minWeightFractionLeaf, 'list') - checkIsClass(maxFeatures, 'list') - checkIsClass(maxLeafNodes, 'list') - checkIsClass(minImpurityDecrease, 'list') - checkIsClass(classWeight, 'list') - - lapply(1:length(criterion), function(i) - checkIsClass(criterion[[i]] , 'character')) - lapply(1:length(splitter), function(i) - checkIsClass(splitter[[i]] , 'character')) - - - lapply(1:length(criterion), - function(i) { - if (!criterion[[i]] %in% c('gini', 'entropy')) { - stop('Incorrect criterion') - } - }) - - - lapply(1:length(maxDepth), function(i) - checkIsClass(maxDepth[[i]] , c("numeric", "integer", "NULL"))) - lapply(1:length(maxDepth), function(i) - checkHigher(ifelse(is.null(maxDepth[[i]]), 1, maxDepth[[i]]) , 0)) + + checkIsClass(criterion, "list") + checkIsClass(splitter, "list") + checkIsClass(maxDepth, "list") + checkIsClass(minSamplesSplit, "list") + 
checkIsClass(minSamplesLeaf, "list") + checkIsClass(minWeightFractionLeaf, "list") + checkIsClass(maxFeatures, "list") + checkIsClass(maxLeafNodes, "list") + checkIsClass(minImpurityDecrease, "list") + checkIsClass(classWeight, "list") + + lapply(1:length(criterion), function(i) { + checkIsClass(criterion[[i]], "character") + }) + lapply(1:length(splitter), function(i) { + checkIsClass(splitter[[i]], "character") + }) + + + lapply( + 1:length(criterion), + function(i) { + if (!criterion[[i]] %in% c("gini", "entropy")) { + stop("Incorrect criterion") + } + } + ) + + + lapply(1:length(maxDepth), function(i) { + checkIsClass(maxDepth[[i]], c("numeric", "integer", "NULL")) + }) + lapply(1:length(maxDepth), function(i) { + checkHigher(ifelse(is.null(maxDepth[[i]]), 1, maxDepth[[i]]), 0) + }) for (i in 1:length(maxDepth)) { - if (inherits(x = maxDepth[[i]], what = c("numeric", "integer"))) { + if (inherits(x = maxDepth[[i]], what = c("numeric", "integer"))) { maxDepth[[i]] <- as.integer(maxDepth[[i]]) } } - - lapply(1:length(minSamplesSplit), - function(i) - checkIsClass(minSamplesSplit[[i]] , c("numeric", "integer", "NULL"))) - lapply(1:length(minSamplesSplit), - function(i) - checkHigher(ifelse( - is.null(minSamplesSplit[[i]]), 1, minSamplesSplit[[i]] - ) , 0)) - + + lapply( + 1:length(minSamplesSplit), + function(i) { + checkIsClass(minSamplesSplit[[i]], c("numeric", "integer", "NULL")) + } + ) + lapply( + 1:length(minSamplesSplit), + function(i) { + checkHigher(ifelse( + is.null(minSamplesSplit[[i]]), 1, minSamplesSplit[[i]] + ), 0) + } + ) + # convert to integer if >= 1 for (i in 1:length(minSamplesSplit)) { if (minSamplesSplit[[i]] >= 1) { minSamplesSplit[[i]] <- as.integer(minSamplesSplit[[i]]) } } - - - lapply(1:length(minSamplesLeaf), - function(i) - checkIsClass(minSamplesLeaf[[i]] , c("numeric", "integer"))) - lapply(1:length(minSamplesLeaf), - function(i) - checkHigher(minSamplesLeaf[[i]] , 0)) - + + + lapply( + 1:length(minSamplesLeaf), + function(i) { + checkIsClass(minSamplesLeaf[[i]], c("numeric", "integer")) + } + ) + lapply( + 1:length(minSamplesLeaf), + function(i) { + checkHigher(minSamplesLeaf[[i]], 0) + } + ) + # convert to integer if >= 1 for (i in 1:length(minSamplesLeaf)) { if (minSamplesLeaf[[i]] >= 1) { minSamplesLeaf[[i]] <- as.integer(minSamplesLeaf[[i]]) } } - - lapply(1:length(minWeightFractionLeaf), - function(i) - checkIsClass(minWeightFractionLeaf[[i]] , c("numeric"))) - lapply(1:length(minWeightFractionLeaf), - function(i) - checkHigherEqual(minWeightFractionLeaf[[i]] , 0)) - - lapply(1:length(maxFeatures), - function(i) - checkIsClass(maxFeatures[[i]] , c( - "numeric", "integer", "character", "NULL" - ))) - + + lapply( + 1:length(minWeightFractionLeaf), + function(i) { + checkIsClass(minWeightFractionLeaf[[i]], c("numeric")) + } + ) + lapply( + 1:length(minWeightFractionLeaf), + function(i) { + checkHigherEqual(minWeightFractionLeaf[[i]], 0) + } + ) + + lapply( + 1:length(maxFeatures), + function(i) { + checkIsClass(maxFeatures[[i]], c( + "numeric", "integer", "character", "NULL" + )) + } + ) + for (i in 1:length(maxFeatures)) { - if (inherits(x = maxFeatures[[i]], what = c("numeric", "integer"))) { + if (inherits(x = maxFeatures[[i]], what = c("numeric", "integer"))) { maxFeatures[[i]] <- as.integer(maxFeatures[[i]]) } } - - lapply(1:length(maxLeafNodes), - function(i) - checkIsClass(maxLeafNodes[[i]], c("integer", "NULL"))) - lapply(1:length(maxLeafNodes), - function(i) - checkHigher(ifelse( - is.null(maxLeafNodes[[i]]), 1, maxLeafNodes[[i]] - ) , 0)) - + + 
lapply( + 1:length(maxLeafNodes), + function(i) { + checkIsClass(maxLeafNodes[[i]], c("integer", "NULL")) + } + ) + lapply( + 1:length(maxLeafNodes), + function(i) { + checkHigher(ifelse( + is.null(maxLeafNodes[[i]]), 1, maxLeafNodes[[i]] + ), 0) + } + ) + for (i in 1:length(maxLeafNodes)) { - if (inherits(x = maxLeafNodes[[i]], what = c("numeric", "integer"))) { + if (inherits(x = maxLeafNodes[[i]], what = c("numeric", "integer"))) { maxLeafNodes[[i]] <- as.integer(maxLeafNodes[[i]]) } } - - lapply(1:length(minImpurityDecrease), - function(i) - checkIsClass(minImpurityDecrease[[i]] , c("numeric"))) - lapply(1:length(minImpurityDecrease), - function(i) - checkHigherEqual(minImpurityDecrease[[i]], 0)) - - lapply(1:length(classWeight), function(i) - checkIsClass(classWeight[[i]] , c('character', 'NULL'))) - - # test python is available and the required dependancies are there: - ##checkPython() - - # scikit-learn 1.0.1 inputs: - # criterion='gini', splitter='best', max_depth=None, min_samples_split=2, - # min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, - # max_leaf_nodes=None, min_impurity_decrease=0.0, class_weight=None, ccp_alpha=0.0 - - # must be correct order for python classifier as I can't find a way to do.call a named list - # using reticulate + + lapply( + 1:length(minImpurityDecrease), + function(i) { + checkIsClass(minImpurityDecrease[[i]], c("numeric")) + } + ) + lapply( + 1:length(minImpurityDecrease), + function(i) { + checkHigherEqual(minImpurityDecrease[[i]], 0) + } + ) + + lapply(1:length(classWeight), function(i) { + checkIsClass(classWeight[[i]], c("character", "NULL")) + }) + paramGrid <- list( criterion = criterion, splitter = splitter, @@ -270,44 +306,45 @@ setDecisionTree <- function(criterion = list('gini'), classWeight = classWeight ) param <- listCartesian(paramGrid) - - attr(param, 'settings') <- list( - modelType = 'decisionTree', + + attr(param, "settings") <- list( + modelType = "decisionTree", seed = seed[[1]], paramNames = names(paramGrid), - #use this for logging params - requiresDenseMatrix = F, + requiresDenseMatrix = FALSE, name = "Decision Tree", pythonModule = "sklearn.tree", pythonClass = "DecisionTreeClassifier" ) - - attr(param, 'saveToJson') <- T - attr(param, 'saveType') <- 'file' - - result <- list(fitFunction = "fitSklearn", - param = param) + + attr(param, "saveToJson") <- TRUE + attr(param, "saveType") <- "file" + + result <- list( + fitFunction = "fitSklearn", + param = param + ) class(result) <- "modelSettings" - + return(result) } DecisionTreeClassifierInputs <- function(classifier, param) { model <- classifier( - criterion = param[[which.max(names(param) == 'criterion')]], - splitter = param[[which.max(names(param) == 'splitter')]], - max_depth = param[[which.max(names(param) == 'maxDepth')]], - min_samples_split = param[[which.max(names(param) == 'minSamplesSplit')]], - min_samples_leaf = param[[which.max(names(param) == 'minSamplesLeaf')]], - min_weight_fraction_leaf = param[[which.max(names(param) == 'minWeightFractionLeaf')]], - max_features = param[[which.max(names(param) == 'maxFeatures')]], - random_state = param[[which.max(names(param) == 'seed')]], - max_leaf_nodes = param[[which.max(names(param) == 'maxLeafNodes')]], - min_impurity_decrease = param[[which.max(names(param) == 'minImpurityDecrease')]], - class_weight = param[[which.max(names(param) == 'classWeight')]] - ) - + criterion = param[[which.max(names(param) == "criterion")]], + splitter = param[[which.max(names(param) == "splitter")]], 
+ max_depth = param[[which.max(names(param) == "maxDepth")]], + min_samples_split = param[[which.max(names(param) == "minSamplesSplit")]], + min_samples_leaf = param[[which.max(names(param) == "minSamplesLeaf")]], + min_weight_fraction_leaf = param[[which.max(names(param) == "minWeightFractionLeaf")]], + max_features = param[[which.max(names(param) == "maxFeatures")]], + random_state = param[[which.max(names(param) == "seed")]], + max_leaf_nodes = param[[which.max(names(param) == "maxLeafNodes")]], + min_impurity_decrease = param[[which.max(names(param) == "minImpurityDecrease")]], + class_weight = param[[which.max(names(param) == "classWeight")]] + ) + return(model) } @@ -348,12 +385,12 @@ DecisionTreeClassifierInputs <- function(classifier, param) { #' } #' @export setMLP <- function(hiddenLayerSizes = list(c(100), c(20)), - #must be integers - activation = list('relu'), - solver = list('adam'), + # must be integers + activation = list("relu"), + solver = list("adam"), alpha = list(0.3, 0.01, 0.0001, 0.000001), - batchSize = list('auto'), - learningRate = list('constant'), + batchSize = list("auto"), + learningRate = list("constant"), learningRateInit = list(0.001), powerT = list(0.5), maxIter = list(200, 100), @@ -369,52 +406,52 @@ setMLP <- function(hiddenLayerSizes = list(c(100), c(20)), epsilon = list(0.00000001), nIterNoChange = list(10), seed = sample(100000, 1)) { - checkIsClass(seed, c('numeric', 'integer')) - checkIsClass(hiddenLayerSizes, c('list')) - checkIsClass(activation, c('list')) - checkIsClass(solver, c('list')) - checkIsClass(alpha, c('list')) - checkIsClass(batchSize, c('list')) - checkIsClass(learningRate, c('list')) - checkIsClass(learningRateInit, c('list')) - checkIsClass(powerT, c('list')) - checkIsClass(maxIter, c('list')) - checkIsClass(shuffle, c('list')) - checkIsClass(tol, c('list')) - checkIsClass(warmStart, c('list')) - checkIsClass(momentum, c('list')) - checkIsClass(nesterovsMomentum, c('list')) - checkIsClass(earlyStopping, c('list')) - checkIsClass(validationFraction, c('list')) - checkIsClass(beta1, c('list')) - checkIsClass(beta2, c('list')) - checkIsClass(epsilon, c('list')) - checkIsClass(nIterNoChange, c('list')) - - + checkIsClass(seed, c("numeric", "integer")) + checkIsClass(hiddenLayerSizes, c("list")) + checkIsClass(activation, c("list")) + checkIsClass(solver, c("list")) + checkIsClass(alpha, c("list")) + checkIsClass(batchSize, c("list")) + checkIsClass(learningRate, c("list")) + checkIsClass(learningRateInit, c("list")) + checkIsClass(powerT, c("list")) + checkIsClass(maxIter, c("list")) + checkIsClass(shuffle, c("list")) + checkIsClass(tol, c("list")) + checkIsClass(warmStart, c("list")) + checkIsClass(momentum, c("list")) + checkIsClass(nesterovsMomentum, c("list")) + checkIsClass(earlyStopping, c("list")) + checkIsClass(validationFraction, c("list")) + checkIsClass(beta1, c("list")) + checkIsClass(beta2, c("list")) + checkIsClass(epsilon, c("list")) + checkIsClass(nIterNoChange, c("list")) + + for (i in 1:length(hiddenLayerSizes)) { hiddenLayerSizes[[i]] <- as.integer(hiddenLayerSizes[[i]]) } - - + + for (i in 1:length(batchSize)) { - if (inherits(x = batchSize[[i]], what = c("numeric", "integer"))) { + if (inherits(x = batchSize[[i]], what = c("numeric", "integer"))) { batchSize[[i]] <- as.integer(batchSize[[i]]) } } - + for (i in 1:length(maxIter)) { - if (inherits(x = maxIter[[i]], what = c("numeric", "integer"))) { + if (inherits(x = maxIter[[i]], what = c("numeric", "integer"))) { maxIter[[i]] <- as.integer(maxIter[[i]]) } } - + for 
(i in 1:length(nIterNoChange)) { - if (inherits(x = nIterNoChange[[i]], what = c("numeric", "integer"))) { + if (inherits(x = nIterNoChange[[i]], what = c("numeric", "integer"))) { nIterNoChange[[i]] <- as.integer(nIterNoChange[[i]]) } } - + # add lapply for values... paramGrid <- list( hiddenLayerSizes = hiddenLayerSizes, @@ -436,60 +473,62 @@ setMLP <- function(hiddenLayerSizes = list(c(100), c(20)), earlyStopping = earlyStopping, validationFraction = validationFraction, beta1 = beta1, - beta2 = beta2 , + beta2 = beta2, epsilon = epsilon, nIterNoChange = nIterNoChange ) - + param <- listCartesian(paramGrid) - - attr(param, 'settings') <- list( - modelType = 'mlp', + + attr(param, "settings") <- list( + modelType = "mlp", seed = seed[[1]], paramNames = names(paramGrid), - #use this for logging params - requiresDenseMatrix = F, + # use this for logging params + requiresDenseMatrix = FALSE, name = "Neural Network", pythonModule = "sklearn.neural_network", pythonClass = "MLPClassifier" ) - - attr(param, 'saveToJson') <- T - attr(param, 'saveType') <- 'file' - - result <- list(fitFunction = "fitSklearn", - param = param) + + attr(param, "saveToJson") <- TRUE + attr(param, "saveType") <- "file" + + result <- list( + fitFunction = "fitSklearn", + param = param + ) class(result) <- "modelSettings" - + return(result) } MLPClassifierInputs <- function(classifier, param) { model <- classifier( - hidden_layer_sizes = param[[which.max(names(param) == 'hiddenLayerSizes')]], - activation = param[[which.max(names(param) == 'activation')]], - solver = param[[which.max(names(param) == 'solver')]], - alpha = param[[which.max(names(param) == 'alpha')]], - batch_size = param[[which.max(names(param) == 'batchSize')]], - learning_rate = param[[which.max(names(param) == 'learningRate')]], - learning_rate_init = param[[which.max(names(param) == 'learningRateInit')]], - power_t = param[[which.max(names(param) == 'powerT')]], - max_iter = param[[which.max(names(param) == 'maxIter')]], - shuffle = param[[which.max(names(param) == 'shuffle')]], - random_state = param[[which.max(names(param) == 'seed')]], - tol = param[[which.max(names(param) == 'tol')]], - verbose = F, - warm_start = param[[which.max(names(param) == 'warmStart')]], - momentum = param[[which.max(names(param) == 'momentum')]], - nesterovs_momentum = param[[which.max(names(param) == 'nesterovsMomentum')]], - early_stopping = param[[which.max(names(param) == 'earlyStopping')]], - validation_fraction = param[[which.max(names(param) == 'validationFraction')]], - beta_1 = param[[which.max(names(param) == 'beta1')]], - beta_2 = param[[which.max(names(param) == 'beta2')]], - epsilon = param[[which.max(names(param) == 'epsilon')]], - n_iter_no_change = param[[which.max(names(param) == 'nIterNoChange')]] - ) - + hidden_layer_sizes = param[[which.max(names(param) == "hiddenLayerSizes")]], + activation = param[[which.max(names(param) == "activation")]], + solver = param[[which.max(names(param) == "solver")]], + alpha = param[[which.max(names(param) == "alpha")]], + batch_size = param[[which.max(names(param) == "batchSize")]], + learning_rate = param[[which.max(names(param) == "learningRate")]], + learning_rate_init = param[[which.max(names(param) == "learningRateInit")]], + power_t = param[[which.max(names(param) == "powerT")]], + max_iter = param[[which.max(names(param) == "maxIter")]], + shuffle = param[[which.max(names(param) == "shuffle")]], + random_state = param[[which.max(names(param) == "seed")]], + tol = param[[which.max(names(param) == "tol")]], + 
verbose = FALSE, + warm_start = param[[which.max(names(param) == "warmStart")]], + momentum = param[[which.max(names(param) == "momentum")]], + nesterovs_momentum = param[[which.max(names(param) == "nesterovsMomentum")]], + early_stopping = param[[which.max(names(param) == "earlyStopping")]], + validation_fraction = param[[which.max(names(param) == "validationFraction")]], + beta_1 = param[[which.max(names(param) == "beta1")]], + beta_2 = param[[which.max(names(param) == "beta2")]], + epsilon = param[[which.max(names(param) == "epsilon")]], + n_iter_no_change = param[[which.max(names(param) == "nIterNoChange")]] + ) + return(model) } @@ -503,35 +542,35 @@ MLPClassifierInputs <- function(classifier, param) { #' } #' @export setNaiveBayes <- function() { - # test python is available and the required dependancies are there: - ##checkPython() - - param <- list(none = 'true') - - attr(param, 'settings') <- list( - modelType = 'naiveBayes', + param <- list(none = "true") + + attr(param, "settings") <- list( + modelType = "naiveBayes", seed = as.integer(0), paramNames = c(), - #use this for logging params - requiresDenseMatrix = T, + # use this for logging params + requiresDenseMatrix = TRUE, name = "Naive Bayes", pythonModule = "sklearn.naive_bayes", pythonClass = "GaussianNB" ) - - attr(param, 'saveToJson') <- T - attr(param, 'saveType') <- 'file' - - result <- list(fitFunction = "fitSklearn", - param = param) + + attr(param, "saveToJson") <- TRUE + attr(param, "saveType") <- "file" + + result <- list( + + fitFunction = "fitSklearn", + param = param + ) class(result) <- "modelSettings" - + return(result) } GaussianNBInputs <- function(classifier, param) { model <- classifier() - + return(model) } @@ -563,17 +602,19 @@ GaussianNBInputs <- function(classifier, param) { #' #' @examples #' \dontrun{ -#' model.rf <- setRandomForest(mtries=list('auto',5,20), ntrees=c(10,100), -#' maxDepth=c(5,20)) +#' model.rf <- setRandomForest( +#' mtries = list("auto", 5, 20), ntrees = c(10, 100), +#' maxDepth = c(5, 20) +#' ) #' } #' @export -setRandomForest <- function(ntrees = list(100, 500), - criterion = list('gini'), +setRandomForest <- function(ntrees = list(100, 500), + criterion = list("gini"), maxDepth = list(4, 10, 17), minSamplesSplit = list(2, 5), minSamplesLeaf = list(1, 10), minWeightFractionLeaf = list(0), - mtries = list('sqrt', 'log2'), + mtries = list("sqrt", "log2"), maxLeafNodes = list(NULL), minImpurityDecrease = list(0), bootstrap = list(TRUE), @@ -582,22 +623,22 @@ setRandomForest <- function(ntrees = list(100, 500), nJobs = list(NULL), classWeight = list(NULL), seed = sample(100000, 1)) { - checkIsClass(seed, c('numeric', 'integer')) - checkIsClass(ntrees, c('list')) - checkIsClass(criterion, c('list')) - checkIsClass(maxDepth, c('list')) - checkIsClass(minSamplesSplit, c('list')) - checkIsClass(minSamplesLeaf, c('list')) - checkIsClass(minWeightFractionLeaf, c('list')) - checkIsClass(mtries, c('list')) - checkIsClass(maxLeafNodes, c('list')) - checkIsClass(minImpurityDecrease, c('list')) - checkIsClass(bootstrap, c('list')) - checkIsClass(maxSamples, c('list')) - checkIsClass(oobScore, c('list')) - checkIsClass(nJobs, c('list')) - checkIsClass(classWeight, c('list')) - + checkIsClass(seed, c("numeric", "integer")) + checkIsClass(ntrees, c("list")) + checkIsClass(criterion, c("list")) + checkIsClass(maxDepth, c("list")) + checkIsClass(minSamplesSplit, c("list")) + checkIsClass(minSamplesLeaf, c("list")) + checkIsClass(minWeightFractionLeaf, c("list")) + checkIsClass(mtries, c("list")) + 
checkIsClass(maxLeafNodes, c("list")) + checkIsClass(minImpurityDecrease, c("list")) + checkIsClass(bootstrap, c("list")) + checkIsClass(maxSamples, c("list")) + checkIsClass(oobScore, c("list")) + checkIsClass(nJobs, c("list")) + checkIsClass(classWeight, c("list")) + # convert to integer when needed for (i in 1:length(ntrees)) { if (inherits(x = ntrees[[i]], what = c("numeric", "integer"))) { @@ -605,46 +646,46 @@ setRandomForest <- function(ntrees = list(100, 500), } } for (i in 1:length(maxDepth)) { - if (inherits(x = maxDepth[[i]], what = c("numeric", "integer"))) { + if (inherits(x = maxDepth[[i]], what = c("numeric", "integer"))) { maxDepth[[i]] <- as.integer(maxDepth[[i]]) } } - + for (i in 1:length(minSamplesSplit)) { if (minSamplesSplit[[i]] >= 1) { minSamplesSplit[[i]] <- as.integer(minSamplesSplit[[i]]) } } - + for (i in 1:length(minSamplesLeaf)) { if (minSamplesLeaf[[i]] >= 1) { minSamplesLeaf[[i]] <- as.integer(minSamplesLeaf[[i]]) } } - + for (i in 1:length(maxLeafNodes)) { - if (inherits(x = maxLeafNodes[[i]], what = c("numeric", "integer"))) { + if (inherits(x = maxLeafNodes[[i]], what = c("numeric", "integer"))) { maxLeafNodes[[i]] <- as.integer(maxLeafNodes[[i]]) } } - + for (i in 1:length(nJobs)) { - if (inherits(x = nJobs[[i]], what = c("numeric", "integer"))) { + if (inherits(x = nJobs[[i]], what = c("numeric", "integer"))) { nJobs[[i]] <- as.integer(nJobs[[i]]) } } - + for (i in 1:length(maxSamples)) { - if (inherits(x = maxSamples[[i]], what = c("numeric", "integer"))) { + if (inherits(x = maxSamples[[i]], what = c("numeric", "integer"))) { if (maxSamples[[i]] >= 1) { maxSamples[[i]] <- as.integer(maxSamples[[i]]) } } } - + # add value checks - paramGrid = list( - ntrees = ntrees, + paramGrid <- list( + ntrees = ntrees, criterion = criterion, maxDepth = maxDepth, minSamplesSplit = minSamplesSplit, @@ -661,50 +702,52 @@ setRandomForest <- function(ntrees = list(100, 500), maxSamples = maxSamples ) param <- listCartesian(paramGrid) - - attr(param, 'settings') <- list( - modelType = 'randomForest', + + attr(param, "settings") <- list( + modelType = "randomForest", seed = seed[[1]], paramNames = names(paramGrid), - #use this for logging params - requiresDenseMatrix = F, + # use this for logging params + requiresDenseMatrix = FALSE, name = "Random forest", pythonModule = "sklearn.ensemble", pythonClass = "RandomForestClassifier" ) - - attr(param, 'saveToJson') <- T - attr(param, 'saveType') <- 'file' - - result <- list(fitFunction = "fitSklearn", - param = param) + + attr(param, "saveToJson") <- TRUE + attr(param, "saveType") <- "file" + + result <- list( + fitFunction = "fitSklearn", + param = param + ) class(result) <- "modelSettings" - + return(result) } RandomForestClassifierInputs <- function(classifier, param) { model <- classifier( - n_estimators = param[[which.max(names(param) == 'ntrees')]], - criterion = param[[which.max(names(param) == 'criterion')]], - max_depth = param[[which.max(names(param) == 'maxDepth')]], - min_samples_split = param[[which.max(names(param) == 'minSamplesSplit')]], - min_samples_leaf = param[[which.max(names(param) == 'minSamplesLeaf')]], - min_weight_fraction_leaf = param[[which.max(names(param) == 'minWeightFractionLeaf')]], - max_features = param[[which.max(names(param) == 'mtries')]], - max_leaf_nodes = param[[which.max(names(param) == 'maxLeafNodes')]], - min_impurity_decrease = param[[which.max(names(param) == 'minImpurityDecrease')]], - bootstrap = param[[which.max(names(param) == 'bootstrap')]], - max_samples = 
param[[which.max(names(param) == 'maxSamples')]], - oob_score = param[[which.max(names(param) == 'oobScore')]], - n_jobs = param[[which.max(names(param) == 'nJobs')]], - random_state = param[[which.max(names(param) == 'seed')]], + n_estimators = param[[which.max(names(param) == "ntrees")]], + criterion = param[[which.max(names(param) == "criterion")]], + max_depth = param[[which.max(names(param) == "maxDepth")]], + min_samples_split = param[[which.max(names(param) == "minSamplesSplit")]], + min_samples_leaf = param[[which.max(names(param) == "minSamplesLeaf")]], + min_weight_fraction_leaf = param[[which.max(names(param) == "minWeightFractionLeaf")]], + max_features = param[[which.max(names(param) == "mtries")]], + max_leaf_nodes = param[[which.max(names(param) == "maxLeafNodes")]], + min_impurity_decrease = param[[which.max(names(param) == "minImpurityDecrease")]], + bootstrap = param[[which.max(names(param) == "bootstrap")]], + max_samples = param[[which.max(names(param) == "maxSamples")]], + oob_score = param[[which.max(names(param) == "oobScore")]], + n_jobs = param[[which.max(names(param) == "nJobs")]], + random_state = param[[which.max(names(param) == "seed")]], verbose = 0L, - warm_start = F, - class_weight = param[[which.max(names(param) == 'classWeight')]] + warm_start = FALSE, + class_weight = param[[which.max(names(param) == "classWeight")]] ) - + return(model) } @@ -724,37 +767,37 @@ RandomForestClassifierInputs <- function(classifier, param) { #' #' @examples #' \dontrun{ -#' model.svm <- setSVM(kernel='rbf', seed = NULL) +#' model.svm <- setSVM(kernel = "rbf", seed = NULL) #' } #' @export setSVM <- function(C = list(1, 0.9, 2, 0.1), - kernel = list('rbf'), + kernel = list("rbf"), degree = list(1, 3, 5), - gamma = list('scale', 1e-04, 3e-05, 0.001, 0.01, 0.25), + gamma = list("scale", 1e-04, 3e-05, 0.001, 0.01, 0.25), coef0 = list(0.0), shrinking = list(TRUE), tol = list(0.001), classWeight = list(NULL), - cacheSize = 500, + cacheSize = 500, seed = sample(100000, 1)) { - checkIsClass(seed, c('numeric', 'integer')) - checkIsClass(cacheSize, c('numeric', 'integer')) - checkIsClass(C, c('list')) - checkIsClass(kernel, c('list')) - checkIsClass(degree, c('list')) - checkIsClass(gamma, c('list')) - checkIsClass(coef0, c('list')) - checkIsClass(shrinking, c('list')) - checkIsClass(tol, c('list')) - checkIsClass(classWeight, c('list')) - + checkIsClass(seed, c("numeric", "integer")) + checkIsClass(cacheSize, c("numeric", "integer")) + checkIsClass(C, c("list")) + checkIsClass(kernel, c("list")) + checkIsClass(degree, c("list")) + checkIsClass(gamma, c("list")) + checkIsClass(coef0, c("list")) + checkIsClass(shrinking, c("list")) + checkIsClass(tol, c("list")) + checkIsClass(classWeight, c("list")) + for (i in 1:length(degree)) { - if (inherits(x = degree[[i]], what = c("numeric", "integer"))) { + if (inherits(x = degree[[i]], what = c("numeric", "integer"))) { degree[[i]] <- as.integer(degree[[i]]) } } - - paramGrid = list( + + paramGrid <- list( C = C, kernel = kernel, degree = degree, @@ -766,49 +809,51 @@ setSVM <- function(C = list(1, 0.9, 2, 0.1), classWeight = classWeight, seed = list(as.integer(seed[[1]])) ) - + param <- listCartesian(paramGrid) - - - attr(param, 'settings') <- list( - modelType = 'svm', + + + attr(param, "settings") <- list( + modelType = "svm", seed = seed[[1]], paramNames = names(paramGrid), - #use this for logging params - requiresDenseMatrix = F, + # use this for logging params + requiresDenseMatrix = FALSE, name = "Support Vector Machine", pythonModule = 
"sklearn.svm", pythonClass = "SVC" ) - - attr(param, 'saveToJson') <- T - attr(param, 'saveType') <- 'file' - - result <- list(fitFunction = "fitSklearn", - param = param) + + attr(param, "saveToJson") <- TRUE + attr(param, "saveType") <- "file" + + result <- list( + fitFunction = "fitSklearn", + param = param + ) class(result) <- "modelSettings" - + return(result) } SVCInputs <- function(classifier, param) { model <- classifier( - C = param[[which.max(names(param) == 'C')]], - kernel = param[[which.max(names(param) == 'kernel')]], - degree = param[[which.max(names(param) == 'degree')]], - gamma = param[[which.max(names(param) == 'gamma')]], - coef0 = param[[which.max(names(param) == 'coef0')]], - shrinking = param[[which.max(names(param) == 'shrinking')]], - probability = T, - tol = param[[which.max(names(param) == 'tol')]], - cache_size = param[[which.max(names(param) == 'cacheSize')]], - class_weight = param[[which.max(names(param) == 'classWeight')]], - verbose = F, + C = param[[which.max(names(param) == "C")]], + kernel = param[[which.max(names(param) == "kernel")]], + degree = param[[which.max(names(param) == "degree")]], + gamma = param[[which.max(names(param) == "gamma")]], + coef0 = param[[which.max(names(param) == "coef0")]], + shrinking = param[[which.max(names(param) == "shrinking")]], + probability = TRUE, + tol = param[[which.max(names(param) == "tol")]], + cache_size = param[[which.max(names(param) == "cacheSize")]], + class_weight = param[[which.max(names(param) == "classWeight")]], + verbose = FALSE, max_iter = as.integer(-1), - decision_function_shape = 'ovr', - break_ties = F, - random_state = param[[which.max(names(param) == 'seed')]] + decision_function_shape = "ovr", + break_ties = FALSE, + random_state = param[[which.max(names(param) == "seed")]] ) - + return(model) } diff --git a/R/SklearnToJson.R b/R/SklearnToJson.R index 2a277133..9ff3062b 100644 --- a/R/SklearnToJson.R +++ b/R/SklearnToJson.R @@ -21,8 +21,8 @@ #' @param path path to the saved model file #' @export sklearnToJson <- function(model, path) { - py <- reticulate::import_builtins(convert=FALSE) - json <- reticulate::import("json", convert=FALSE) + py <- reticulate::import_builtins(convert = FALSE) + json <- reticulate::import("json", convert = FALSE) if (inherits(model, "sklearn.tree._classes.DecisionTreeClassifier")) { serializedModel <- serializeDecisionTree(model) } else if (inherits(model, "sklearn.ensemble._forest.RandomForestClassifier")) { @@ -32,27 +32,27 @@ sklearnToJson <- function(model, path) { } else if (inherits(model, "sklearn.naive_bayes.GaussianNB")) { serializedModel <- serializeNaiveBayes(model) } else if (inherits(model, "sklearn.neural_network._multilayer_perceptron.MLPClassifier")) { - serializedModel <- serializeMLP(model) - } else if (inherits(model, "sklearn.svm._classes.SVC" )) { + serializedModel <- serializeMLP(model) + } else if (inherits(model, "sklearn.svm._classes.SVC")) { serializedModel <- serializeSVM(model) } else { stop("Unsupported model") } - - with(py$open(path, "w"), as=file, { - json$dump(serializedModel, fp=file) + + with(py$open(path, "w"), as = file, { + json$dump(serializedModel, fp = file) }) return(invisible()) } -#' Loads sklearn python model from json +#' Loads sklearn python model from json #' @param path path to the model json file #' @export sklearnFromJson <- function(path) { - py <- reticulate::import_builtins(convert=FALSE) - json <- reticulate::import("json", convert=FALSE) - with(py$open(path, "r"), as=file, { - model <- json$load(fp=file) + py 
<- reticulate::import_builtins(convert = FALSE) + json <- reticulate::import("json", convert = FALSE) + with(py$open(path, "r"), as = file, { + model <- json$load(fp = file) }) if (reticulate::py_bool(model["meta"] == "decision-tree")) { model <- deSerializeDecisionTree(model) @@ -64,7 +64,7 @@ sklearnFromJson <- function(path) { model <- deSerializeNaiveBayes(model) } else if (reticulate::py_bool(model["meta"] == "mlp")) { model <- deSerializeMlp(model) - } else if (reticulate::py_bool(model["meta"] == "svm")) { + } else if (reticulate::py_bool(model["meta"] == "svm")) { model <- deSerializeSVM(model) } else { stop("Unsupported model") @@ -75,90 +75,100 @@ sklearnFromJson <- function(path) { serializeTree <- function(tree) { serializedTree <- tree$`__getstate__`() dtypes <- serializedTree["nodes"]$dtype - + serializedTree["nodes"] <- serializedTree["nodes"]$tolist() serializedTree["values"] <- serializedTree["values"]$tolist() - - return(list(serializedTree, dtypes)) + + return(list(serializedTree, dtypes)) } deSerializeTree <- function(tree_dict, nFeatures, nClasses, nOutputs) { # TODO the below only works for tree_dict loaded from json, if not it - for (i in 0:(length(tree_dict["nodes"])-1)) { - reticulate::py_set_item(tree_dict["nodes"], i, - reticulate::tuple(reticulate::py_to_r(tree_dict["nodes"][i]))) + for (i in 0:(length(tree_dict["nodes"]) - 1)) { + reticulate::py_set_item( + tree_dict["nodes"], i, + reticulate::tuple(reticulate::py_to_r(tree_dict["nodes"][i])) + ) } - + names <- list("left_child", "right_child", "feature", "threshold", "impurity", "n_node_samples", "weighted_n_node_samples") - if (length(tree_dict["nodes"][0])==8) { + if (length(tree_dict["nodes"][0]) == 8) { # model used sklearn>=1.3 which added a parameter names[[8]] <- "missing_go_to_left" } - + sklearn <- reticulate::import("sklearn") np <- reticulate::import("numpy", convert = FALSE) - - tree_dict["nodes"] <- np$array(tree_dict["nodes"], - dtype=np$dtype(reticulate::dict( - names = names, - formats = tree_dict["nodes_dtype"] - ))) + + tree_dict["nodes"] <- np$array(tree_dict["nodes"], + dtype = np$dtype(reticulate::dict( + names = names, + formats = tree_dict["nodes_dtype"] + )) + ) tree_dict["values"] <- np$array(tree_dict["values"]) - - Tree <- sklearn$tree$`_tree`$Tree(nFeatures, - np$array(reticulate::tuple(nClasses), - dtype=np$intp), - nOutputs) - + + Tree <- sklearn$tree$`_tree`$Tree( + nFeatures, + np$array(reticulate::tuple(nClasses), + dtype = np$intp + ), + nOutputs + ) + Tree$`__setstate__`(tree_dict) - + return(Tree) } serializeDecisionTree <- function(model) { - tree <- serializeTree(model$tree_) - dtypes <- tree[[2]] - tree <- tree[[1]] - py <- reticulate::import_builtins(convert=FALSE) - serialized_model <- reticulate::dict( - "meta" = "decision-tree", - "feature_importances_" = model$feature_importances_$tolist(), - "max_features_" = model$max_features_, - "n_classes_" = py$int(model$n_classes_), - "n_features_in_" = model$n_features_in_, - "n_outputs_" = model$n_outputs_, - "tree_" = tree, - "classes_" = model$classes_$tolist(), - "params" = model$get_params() - ) - - tree_dtypes <- list() - for (i in 0:(length(dtypes)-1)) { - tree_dtypes <- c(tree_dtypes, dtypes[[i]]$str) - } - - serialized_model["tree_"]["nodes_dtype"] <- tree_dtypes - return(serialized_model) + tree <- serializeTree(model$tree_) + dtypes <- tree[[2]] + tree <- tree[[1]] + py <- reticulate::import_builtins(convert = FALSE) + serialized_model <- reticulate::dict( + "meta" = "decision-tree", + "feature_importances_" = 
model$feature_importances_$tolist(), + "max_features_" = model$max_features_, + "n_classes_" = py$int(model$n_classes_), + "n_features_in_" = model$n_features_in_, + "n_outputs_" = model$n_outputs_, + "tree_" = tree, + "classes_" = model$classes_$tolist(), + "params" = model$get_params() + ) + + tree_dtypes <- list() + for (i in 0:(length(dtypes) - 1)) { + tree_dtypes <- c(tree_dtypes, dtypes[[i]]$str) + } + + serialized_model["tree_"]["nodes_dtype"] <- tree_dtypes + return(serialized_model) } deSerializeDecisionTree <- function(model_dict) { - np <- reticulate::import("numpy", convert=FALSE) - sklearn <- reticulate::import("sklearn", convert=FALSE) - deserialized_model <- do.call(sklearn$tree$DecisionTreeClassifier, - reticulate::py_to_r(model_dict["params"])) - + np <- reticulate::import("numpy", convert = FALSE) + sklearn <- reticulate::import("sklearn", convert = FALSE) + deserialized_model <- do.call( + sklearn$tree$DecisionTreeClassifier, + reticulate::py_to_r(model_dict["params"]) + ) + deserialized_model$classes_ <- np$array(model_dict["classes_"]) deserialized_model$max_features_ <- model_dict["max_features_"] deserialized_model$n_classes_ <- model_dict["n_classes_"] deserialized_model$n_features_in <- model_dict["n_features_in_"] deserialized_model$n_outputs_ <- model_dict["n_outputs_"] - - tree <- deSerializeTree(model_dict["tree_"], - model_dict["n_features_in_"], - model_dict["n_classes_"], - model_dict["n_outputs_"]) + + tree <- deSerializeTree( + model_dict["tree_"], + model_dict["n_features_in_"], + model_dict["n_classes_"], + model_dict["n_outputs_"] + ) deserialized_model$tree_ <- tree - + return(deserialized_model) } @@ -167,7 +177,7 @@ serializeRandomForest <- function(model) { for (i in 1:length(model$estimators_)) { estimators <- c(estimators, serializeDecisionTree(model$estimators_[i - 1])) } - + serialized_model <- reticulate::dict( "meta" = "rf", "max_depth" = model$max_depth, @@ -183,32 +193,35 @@ serializeRandomForest <- function(model) { "classes_" = model$classes_$tolist(), "estimators_" = reticulate::r_to_py(estimators), "params" = model$get_params(), - "n_classes_" = model$n_classes_) - - if (reticulate::py_bool(model$`__dict__`["oob_score_"] != reticulate::py_none())) { - serialized_model["oob_score_"] <- model$oob_score_ + "n_classes_" = model$n_classes_ + ) + + if (reticulate::py_bool(model$`__dict__`["oob_score_"] != reticulate::py_none())) { + serialized_model["oob_score_"] <- model$oob_score_ serialized_model["oob_decision_function_"] <- model$oob_decision_function_$tolist() } - + return(serialized_model) } - + deSerializeRandomForest <- function(model_dict) { - np <- reticulate::import("numpy", convert=FALSE) - sklearn <- reticulate::import("sklearn", convert=FALSE) - model <- do.call(sklearn$ensemble$RandomForestClassifier, - reticulate::py_to_r(model_dict["params"])) - + np <- reticulate::import("numpy", convert = FALSE) + sklearn <- reticulate::import("sklearn", convert = FALSE) + model <- do.call( + sklearn$ensemble$RandomForestClassifier, + reticulate::py_to_r(model_dict["params"]) + ) + estimators <- list() for (i in 1:length(model_dict$estimators_)) { estimators <- c(estimators, deSerializeDecisionTree(model_dict["estimators_"][i - 1])) } - + model$estimators_ <- np$array(estimators) - + model$classes_ <- np$array(model_dict["classes_"]) model$n_features_in_ <- model_dict["n_features_in_"] - model$n_outputs_ <- model_dict["n_outputs_"] + model$n_outputs_ <- model_dict["n_outputs_"] model$max_depth <- model_dict["max_depth"] 
model$min_samples_split <- model_dict["min_samples_split"] model$min_samples_leaf <- model_dict["min_samples_leaf"] @@ -218,12 +231,12 @@ deSerializeRandomForest <- function(model_dict) { model$min_impurity_decrease <- model_dict["min_impurity_decrease"] model$min_impurity_split <- model_dict["min_impurity_split"] model$n_classes_ <- model_dict["n_classes_"] - - if (reticulate::py_bool(model_dict$oob_score_ != reticulate::py_none())){ + + if (reticulate::py_bool(model_dict$oob_score_ != reticulate::py_none())) { model$oob_score_ <- model_dict["oob_score_"] - model$oob_decision_function_ <- model_dict["oob_decision_function_"] + model$oob_decision_function_ <- model_dict["oob_decision_function_"] } - return(model) + return(model) } serializeAdaboost <- function(model) { @@ -238,32 +251,35 @@ serializeAdaboost <- function(model) { "n_classes_" = model$n_classes_, "params" = model$get_params(), "classes_" = model$classes_$tolist(), - "estimator_weights_" = model$estimator_weights_$tolist()) - + "estimator_weights_" = model$estimator_weights_$tolist() + ) + return(serialized_model) } deSerializeAdaboost <- function(model_dict) { - np <- reticulate::import("numpy", convert=FALSE) - sklearn <- reticulate::import("sklearn", convert=FALSE) - model <- do.call(sklearn$ensemble$AdaBoostClassifier, - reticulate::py_to_r(model_dict["params"])) + np <- reticulate::import("numpy", convert = FALSE) + sklearn <- reticulate::import("sklearn", convert = FALSE) + model <- do.call( + sklearn$ensemble$AdaBoostClassifier, + reticulate::py_to_r(model_dict["params"]) + ) estimators <- list() for (i in 1:length(model_dict$estimators_)) { estimators <- c(estimators, deSerializeDecisionTree(model_dict["estimators_"][i - 1])) } - + model$estimators_ <- np$array(estimators) model$classes_ <- np$array(model_dict["classes_"]) model$n_features_in_ <- model_dict["n_features_in_"] model$n_classes_ <- model_dict["n_classes_"] model$estimator_weights_ <- np$array(model_dict["estimator_weights_"]) - + return(model) } serializeNaiveBayes <- function(model) { - serialized_model = reticulate::dict( + serialized_model <- reticulate::dict( "meta" = "naive-bayes", "classes_" = model$classes_$tolist(), "class_count_" = model$class_count_$tolist(), @@ -277,28 +293,34 @@ serializeNaiveBayes <- function(model) { } deSerializeNaiveBayes <- function(model_dict) { - sklearn <- reticulate::import("sklearn", convert=FALSE) - np <- reticulate::import("numpy", convert=FALSE) - model <- do.call(sklearn$naive_bayes$GaussianNB, - reticulate::py_to_r(model_dict["params"])) - + sklearn <- reticulate::import("sklearn", convert = FALSE) + np <- reticulate::import("numpy", convert = FALSE) + model <- do.call( + sklearn$naive_bayes$GaussianNB, + reticulate::py_to_r(model_dict["params"]) + ) + model$classes_ <- np$array(model_dict["classes_"]) model$class_count_ <- np$array(model_dict["class_count_"]) model$class_prior_ <- np$array(model_dict["class_prior_"]) model$theta_ <- np$array(model_dict["theta_"]) model$epsilon_ <- model_dict["epsilon_"] model$var_ <- np$array(model_dict["var_"]) - + return(model) } serializeMLP <- function(model) { # TODO Check if length(intercepts_) is ever different from length(coefs_) for (i in 0:(length(model$coefs_) - 1)) { - reticulate::py_set_item(model$coefs_, i, - model$coefs_[i]$tolist()) - reticulate::py_set_item(model$intercepts_, i, - model$intercepts_[i]$tolist()) + reticulate::py_set_item( + model$coefs_, i, + model$coefs_[i]$tolist() + ) + reticulate::py_set_item( + model$intercepts_, i, + 
model$intercepts_[i]$tolist() + ) } serialized_model <- reticulate::dict( "meta" = "mlp", @@ -316,32 +338,37 @@ serializeMLP <- function(model) { } deSerializeMlp <- function(model_dict) { - sklearn <- reticulate::import("sklearn", convert=FALSE) - np <- reticulate::import("numpy", convert=FALSE) - - model <- do.call(sklearn$neural_network$MLPClassifier, - reticulate::py_to_r(model_dict["params"])) + sklearn <- reticulate::import("sklearn", convert = FALSE) + np <- reticulate::import("numpy", convert = FALSE) + + model <- do.call( + sklearn$neural_network$MLPClassifier, + reticulate::py_to_r(model_dict["params"]) + ) for (i in 0:(length(model_dict["coefs_"]) - 1)) { - reticulate::py_set_item(model_dict["coefs_"], i, - np$array(model_dict["coefs_"][i])) - reticulate::py_set_item(model_dict["intercepts_"], i, - np$array(model_dict["intercepts_"][i])) - + reticulate::py_set_item( + model_dict["coefs_"], i, + np$array(model_dict["coefs_"][i]) + ) + reticulate::py_set_item( + model_dict["intercepts_"], i, + np$array(model_dict["intercepts_"][i]) + ) } - model$coefs_ = model_dict["coefs_"] - model$loss_ = model_dict["loss_"] - model$intercepts_ = model_dict["intercepts_"] - model$n_iter_ = model_dict["n_iter_"] - model$n_layers_ = model_dict["n_layers_"] - model$n_outputs_ = model_dict["n_outputs_"] - model$out_activation_ = model_dict["out_activation_"] - model$classes_ = np$array(model_dict["classes_"]) - + model$coefs_ <- model_dict["coefs_"] + model$loss_ <- model_dict["loss_"] + model$intercepts_ <- model_dict["intercepts_"] + model$n_iter_ <- model_dict["n_iter_"] + model$n_layers_ <- model_dict["n_layers_"] + model$n_outputs_ <- model_dict["n_outputs_"] + model$out_activation_ <- model_dict["out_activation_"] + model$classes_ <- np$array(model_dict["classes_"]) + return(model) } serializeSVM <- function(model) { - serialized_model = reticulate::dict( + serialized_model <- reticulate::dict( "meta" = "svm", "class_weight_" = model$class_weight_$tolist(), "classes_" = model$classes_$tolist(), @@ -360,84 +387,88 @@ serializeSVM <- function(model) { } else { serialized_model["support_vectors_"] <- serializeCsrMatrix(model$support_vectors_) } - + if (inherits(model$dual_coef_, "numpy.ndarray")) { serialized_model["dual_coef_"] <- model$dual_coef_$tolist() } else { serialized_model["dual_coef_"] <- serializeCsrMatrix(model$dual_coef_) } - + if (inherits(model$`_dual_coef_`, "numpy.ndarray")) { serialized_model["_dual_coef_"] <- model$`_dual_coef_`$tolist() } else { serialized_model["_dual_coef_"] <- serializeCsrMatrix(model$`_dual_coef_`) } - return(serialized_model) + return(serialized_model) } deSerializeSVM <- function(model_dict) { - sklearn <- reticulate::import("sklearn", convert=FALSE) - np <- reticulate::import("numpy", convert=FALSE) - model <- do.call(sklearn$svm$SVC, - reticulate::py_to_r(model_dict["params"])) + sklearn <- reticulate::import("sklearn", convert = FALSE) + np <- reticulate::import("numpy", convert = FALSE) + model <- do.call( + sklearn$svm$SVC, + reticulate::py_to_r(model_dict["params"]) + ) model$shape_fit_ <- model_dict$shape_fit_ - model$`_gamma`<- model_dict["_gamma"] + model$`_gamma` <- model_dict["_gamma"] model$class_weight_ <- np$array(model_dict$class_weight_)$astype(np$float64) - model$classes_ <- np$array(model_dict["classes_"]) + model$classes_ <- np$array(model_dict["classes_"]) model$support_ <- np$array(model_dict["support_"])$astype(np$int32) model$`_n_support` <- np$array(model_dict["n_support_"])$astype(np$int32) model$intercept_ <- 
np$array(model_dict["intercept_"])$astype(np$float64) model$`_probA` <- np$array(model_dict["probA_"])$astype(np$float64) model$`_probB` <- np$array(model_dict["probB_"])$astype(np$float64) model$`_intercept_` <- np$array(model_dict["_intercept_"])$astype(np$float64) - - if (reticulate::py_bool((model_dict$support_vectors_["meta"] != reticulate::py_none())) & - (reticulate::py_bool(model_dict$support_vectors_["meta"] == "csr"))) { + + if (reticulate::py_bool((model_dict$support_vectors_["meta"] != reticulate::py_none())) & + (reticulate::py_bool(model_dict$support_vectors_["meta"] == "csr"))) { model$support_vectors_ <- deSerializeCsrMatrix(model_dict$support_vectors_) model$`_sparse` <- TRUE } else { model$support_vectors_ <- np$array(model_dict$support_vectors_)$astype(np$float64) model$`_sparse` <- FALSE } - if (reticulate::py_bool((model_dict$dual_coef_["meta"] != reticulate::py_none())) & - (reticulate::py_bool(model_dict$dual_coef_["meta"] == "csr"))) { + if (reticulate::py_bool((model_dict$dual_coef_["meta"] != reticulate::py_none())) & + (reticulate::py_bool(model_dict$dual_coef_["meta"] == "csr"))) { model$dual_coef_ <- deSerializeCsrMatrix(model_dict$dual_coef_) } else { model$dual_coef_ <- np$array(model_dict$dual_coef_)$astype(np$float64) } - - if (reticulate::py_bool((model_dict$`_dual_coef_`["meta"] != reticulate::py_none())) & - (reticulate::py_bool(model_dict$`_dual_coef_`["meta"] == "csr"))) { + + if (reticulate::py_bool((model_dict$`_dual_coef_`["meta"] != reticulate::py_none())) & + (reticulate::py_bool(model_dict$`_dual_coef_`["meta"] == "csr"))) { model$`_dual_coef_` <- deSerializeCsrMatrix(model_dict$`dual_coef_`) } else { model$`_dual_coef_` <- np$array(model_dict$`_dual_coef_`)$astype(np$float64) } return(model) -} +} serializeCsrMatrix <- function(csr_matrix) { - serialized_csr_matrix = reticulate::dict( + serialized_csr_matrix <- reticulate::dict( "meta" = "csr", "indices" = csr_matrix$indices$tolist(), "indptr" = csr_matrix$indptr$tolist(), - "_shape"= csr_matrix$`_shape`) + "_shape" = csr_matrix$`_shape` + ) serialized_csr_matrix["data"] <- csr_matrix$data$tolist() return(serialized_csr_matrix) } -deSerializeCsrMatrix <- function(csr_dict, - data_type=np$float64, - indices_type=np$int32, - indptr_type=np$int32) { - sp <- reticulate::import("scipy", convert=FALSE) - np <- reticulate::import("numpy", convert=FALSE) +deSerializeCsrMatrix <- function(csr_dict, + data_type = np$float64, + indices_type = np$int32, + indptr_type = np$int32) { + sp <- reticulate::import("scipy", convert = FALSE) + np <- reticulate::import("numpy", convert = FALSE) csr_matrix <- sp$sparse$csr_matrix( - reticulate::tuple(list(np$array(csr_dict["data"])$astype(data_type), - np$array(csr_dict["indices"])$astype(indices_type), - np$array(csr_dict["indptr"])$astype(indptr_type))), - shape=csr_dict["shape"] + reticulate::tuple(list( + np$array(csr_dict["data"])$astype(data_type), + np$array(csr_dict["indices"])$astype(indices_type), + np$array(csr_dict["indptr"])$astype(indptr_type) + )), + shape = csr_dict["shape"] ) return(csr_matrix) } - \ No newline at end of file diff --git a/man/createFeatureEngineeringSettings.Rd b/man/createFeatureEngineeringSettings.Rd index 0b7a0a8d..9772a9a1 100644 --- a/man/createFeatureEngineeringSettings.Rd +++ b/man/createFeatureEngineeringSettings.Rd @@ -8,7 +8,7 @@ createFeatureEngineeringSettings(type = "none") } \arguments{ \item{type}{(character) Choice of: \itemize{ -\item'none' No feature engineering - this is the default +\item'none' No feature 
engineering - this is the default
 }}
 }
 \value{
diff --git a/man/setAdaBoost.Rd b/man/setAdaBoost.Rd
index adcc4f6f..a13762b2 100644
--- a/man/setAdaBoost.Rd
+++ b/man/setAdaBoost.Rd
@@ -7,7 +7,7 @@
 setAdaBoost(
   nEstimators = list(10, 50, 200),
   learningRate = list(1, 0.5, 0.1),
-  algorithm = list("SAMME.R"),
+  algorithm = list("SAMME"),
   seed = sample(1000000, 1)
 )
 }
@@ -17,7 +17,7 @@ setAdaBoost(
 \item{learningRate}{(list) Weight applied to each classifier at each boosting iteration. A higher learning rate increases the contribution of each classifier. There is a trade-off between learningRate and nEstimators.}
-\item{algorithm}{(list) If ‘SAMME.R’ then use the SAMME.R real boosting algorithm. base_estimator must support calculation of class probabilities. If ‘SAMME’ then use the SAMME discrete boosting algorithm. The SAMME.R algorithm typically converges faster than SAMME, achieving a lower test error with fewer boosting iterations.}
+\item{algorithm}{Only ‘SAMME’ can be provided. The 'algorithm' argument is deprecated and will be removed in scikit-learn 1.8.}
 \item{seed}{A seed for the model}
 }
@@ -26,8 +26,9 @@ Create setting for AdaBoost with python DecisionTreeClassifier base estimator
 }
 \examples{
 \dontrun{
-model.adaBoost <- setAdaBoost(nEstimators = list(10,50,200), learningRate = list(1, 0.5, 0.1),
-                              algorithm = list('SAMME.R'), seed = sample(1000000,1)
-                              )
+model.adaBoost <- setAdaBoost(
+  nEstimators = list(10, 50, 200), learningRate = list(1, 0.5, 0.1),
+  algorithm = list("SAMME"), seed = sample(1000000, 1)
+)
 }
 }
diff --git a/man/setDecisionTree.Rd b/man/setDecisionTree.Rd
index 96f56e74..b5633772 100644
--- a/man/setDecisionTree.Rd
+++ b/man/setDecisionTree.Rd
@@ -2,7 +2,7 @@
 % Please edit documentation in R/SklearnClassifierSettings.R
 \name{setDecisionTree}
 \alias{setDecisionTree}
-\title{Create setting for the scikit-learn 1.0.1 DecisionTree with python}
+\title{Create setting for the scikit-learn DecisionTree with python}
 \usage{
 setDecisionTree(
   criterion = list("gini"),
@@ -42,10 +42,10 @@ setDecisionTree(
 \item{seed}{The random state seed}
 }
 \description{
-Create setting for the scikit-learn 1.0.1 DecisionTree with python
+Create setting for the scikit-learn DecisionTree with python
 }
 \examples{
 \dontrun{
-model.decisionTree <- setDecisionTree(maxDepth=10,minSamplesLeaf=10, seed=NULL )
+model.decisionTree <- setDecisionTree(maxDepth = 10, minSamplesLeaf = 10, seed = NULL)
 }
 }
diff --git a/man/setRandomForest.Rd b/man/setRandomForest.Rd
index dbe7ad0f..7812e522 100644
--- a/man/setRandomForest.Rd
+++ b/man/setRandomForest.Rd
@@ -65,7 +65,9 @@ Create setting for random forest model with python (very fast)
 }
 \examples{
 \dontrun{
-model.rf <- setRandomForest(mtries=list('auto',5,20), ntrees=c(10,100),
-                            maxDepth=c(5,20))
+model.rf <- setRandomForest(
+  mtries = list("auto", 5, 20), ntrees = c(10, 100),
+  maxDepth = c(5, 20)
+)
 }
 }
diff --git a/man/setSVM.Rd b/man/setSVM.Rd
index f21a202e..0b7af491 100644
--- a/man/setSVM.Rd
+++ b/man/setSVM.Rd
@@ -43,6 +43,6 @@ Create setting for the python sklearn SVM (SVC function)
 }
 \examples{
 \dontrun{
-model.svm <- setSVM(kernel='rbf', seed = NULL)
+model.svm <- setSVM(kernel = "rbf", seed = NULL)
 }
 }
diff --git a/tests/testthat/test-sklearnClassifier.R b/tests/testthat/test-sklearnClassifier.R
index c87a1937..7a1e51c8 100644
--- a/tests/testthat/test-sklearnClassifier.R
+++ b/tests/testthat/test-sklearnClassifier.R
@@ -1,134 +1,124 @@
+test_that("DecisionTree settings work
checks", { + dtset <- setDecisionTree( + criterion = list("gini"), + splitter = list("best"), + maxDepth = list(4, 10, NULL), + minSamplesSplit = list(2, 10), + minSamplesLeaf = list(10, 50), + minWeightFractionLeaf = list(0), + maxFeatures = list(100, "sqrt", NULL), + maxLeafNodes = list(NULL), + minImpurityDecrease = list(10^-7), + classWeight = list(NULL), + seed = sample(1000000, 1) + ) + expect_equal(dtset$fitFunction, "fitSklearn") -test_that("DecisionTree settings work checks", { - -dtset <- setDecisionTree( - criterion = list('gini'), - splitter = list('best'), - maxDepth = list(4, 10, NULL), - minSamplesSplit = list(2, 10), - minSamplesLeaf = list(10, 50), - minWeightFractionLeaf = list(0), - maxFeatures = list(100,'sqrt', NULL), - maxLeafNodes = list(NULL), - minImpurityDecrease = list(10^-7), - classWeight = list(NULL), - seed = sample(1000000,1) -) - -expect_equal(dtset$fitFunction, "fitSklearn") - -expect_equal(length(dtset$param), 3*2*2*3*1) - -expect_equal(unique(unlist(lapply(dtset$param, function(x) x[[1]]))), 'gini') -expect_equal(unique(unlist(lapply(dtset$param, function(x) x[[2]]))), 'best') -expect_equal(length(unique(lapply(dtset$param, function(x) x[[3]]))), 3) - -expect_false(attr(dtset$param, 'settings')$requiresDenseMatrix) -expect_equal(attr(dtset$param, 'settings')$name, 'Decision Tree') -expect_equal(attr(dtset$param, 'settings')$pythonModule, 'sklearn.tree') -expect_equal(attr(dtset$param, 'settings')$pythonClass, "DecisionTreeClassifier") + expect_equal(length(dtset$param), 3 * 2 * 2 * 3 * 1) + expect_equal(unique(unlist(lapply(dtset$param, function(x) x[[1]]))), "gini") + expect_equal(unique(unlist(lapply(dtset$param, function(x) x[[2]]))), "best") + expect_equal(length(unique(lapply(dtset$param, function(x) x[[3]]))), 3) + expect_false(attr(dtset$param, "settings")$requiresDenseMatrix) + expect_equal(attr(dtset$param, "settings")$name, "Decision Tree") + expect_equal(attr(dtset$param, "settings")$pythonModule, "sklearn.tree") + expect_equal(attr(dtset$param, "settings")$pythonClass, "DecisionTreeClassifier") }) test_that("DecisionTree errors as expected", { - - expect_error(setDecisionTree(criterion = list('madeup'))) - - expect_error(setDecisionTree(maxDepth = list(-1))) - expect_error(setDecisionTree(minSamplesSplit = list(-1))) - expect_error(setDecisionTree(minSamplesLeaf = list(-1))) - + expect_error(setDecisionTree(criterion = list("madeup"))) + + expect_error(setDecisionTree(maxDepth = list(-1))) + expect_error(setDecisionTree(minSamplesSplit = list(-1))) + expect_error(setDecisionTree(minSamplesLeaf = list(-1))) }) test_that("check fit of DecisionTree", { - modelSettings <- setDecisionTree( - criterion = list('gini'), - splitter = list('best'), + criterion = list("gini"), + splitter = list("best"), maxDepth = list(as.integer(4)), minSamplesSplit = list(2), minSamplesLeaf = list(10), minWeightFractionLeaf = list(0), - maxFeatures = list('sqrt'), + maxFeatures = list("sqrt"), maxLeafNodes = list(NULL), minImpurityDecrease = list(10^-7), classWeight = list(NULL), - seed = sample(1000000,1) + seed = sample(1000000, 1) ) - + plpModel <- fitPlp( - trainData = tinyTrainData, + trainData = tinyTrainData, modelSettings = modelSettings, - analysisId = 'DecisionTree', + analysisId = "DecisionTree", analysisPath = tempdir() - ) - + ) + expect_correct_fitPlp(plpModel, trainData) # add check for other model design settings - }) -test_that('fitSklearn errors with wrong covariateData', { - +test_that("fitSklearn errors with wrong covariateData", { newTrainData <- 
copyTrainData(trainData) - class(newTrainData$covariateData) <- 'notCovariateData' + class(newTrainData$covariateData) <- "notCovariateData" modelSettings <- setAdaBoost() analysisId <- 42 - + expect_error(fitSklearn(newTrainData, - modelSettings, - search='grid', - analysisId)) + modelSettings, + search = "grid", + analysisId + )) }) -test_that('AdaBoost fit works', { - - modelSettings <- setAdaBoost(nEstimators = list(10), - learningRate = list(0.1), - ) - +test_that("AdaBoost fit works", { + modelSettings <- setAdaBoost( + nEstimators = list(10), + learningRate = list(0.1), + ) + plpModel <- fitPlp( - trainData = tinyTrainData, + trainData = tinyTrainData, modelSettings = modelSettings, - analysisId = 'Adaboost', + analysisId = "Adaboost", analysisPath = tempdir() ) - + expect_correct_fitPlp(plpModel, trainData) - expect_equal(dir(plpModel$model),"model.json") - + expect_equal(dir(plpModel$model), "model.json") }) -test_that('RandomForest fit works', { - - modelSettings <- setRandomForest(ntrees=list(10), - maxDepth=list(4), - minSamplesSplit = list(2), - minSamplesLeaf = list(10), - mtries = list("sqrt"), - maxSamples = list(0.9), - classWeight = list(NULL)) - +test_that("RandomForest fit works", { + modelSettings <- setRandomForest( + ntrees = list(10), + maxDepth = list(4), + minSamplesSplit = list(2), + minSamplesLeaf = list(10), + mtries = list("sqrt"), + maxSamples = list(0.9), + classWeight = list(NULL) + ) + plpModel <- fitPlp( - trainData = tinyTrainData, + trainData = tinyTrainData, modelSettings = modelSettings, - analysisId = 'RandomForest', + analysisId = "RandomForest", analysisPath = tempdir() ) - + expect_correct_fitPlp(plpModel, trainData) - expect_equal(dir(plpModel$model),"model.json") - + expect_equal(dir(plpModel$model), "model.json") }) -test_that('MLP fit works', { +test_that("MLP fit works", { modelSettings <- setMLP( hiddenLayerSizes = list(c(20)), alpha = list(1e-6), @@ -137,69 +127,70 @@ test_that('MLP fit works', { learningRateInit = list(0.01), tol = list(1e-2) # reduce tol so I don't get convergence warnings ) - + plpModel <- fitPlp( - trainData = tinyTrainData, + trainData = tinyTrainData, modelSettings = modelSettings, - analysisId = 'MLP', + analysisId = "MLP", analysisPath = tempdir() ) - + expect_correct_fitPlp(plpModel, trainData) - expect_equal(dir(plpModel$model),"model.json") - + expect_equal(dir(plpModel$model), "model.json") }) -test_that('Naive bayes fit works', { +test_that("Naive bayes fit works", { modelSettings <- setNaiveBayes() - + plpModel <- fitPlp( - trainData = tinyTrainData, + trainData = tinyTrainData, modelSettings = modelSettings, - analysisId = 'Naive bayes', + analysisId = "Naive bayes", analysisPath = tempdir() ) - - expect_correct_fitPlp(plpModel, trainData) - expect_equal(dir(plpModel$model),"model.json") - + + expect_correct_fitPlp(plpModel, trainData) + expect_equal(dir(plpModel$model), "model.json") }) -test_that('Support vector machine fit works', { - modelSettings <- setSVM(C = list(1), - degree = list(1), - gamma = list('scale'), - classWeight = list(NULL)) - +test_that("Support vector machine fit works", { + modelSettings <- setSVM( + C = list(1), + degree = list(1), + gamma = list("scale"), + classWeight = list(NULL) + ) + plpModel <- fitPlp( - trainData = tinyTrainData, + trainData = tinyTrainData, modelSettings = modelSettings, - analysisId = 'SVM', + analysisId = "SVM", analysisPath = tempdir() ) - - expect_correct_fitPlp(plpModel, trainData) - expect_equal(dir(plpModel$model),"model.json") - + + 
expect_correct_fitPlp(plpModel, trainData) + expect_equal(dir(plpModel$model), "model.json") }) -test_that('Sklearn predict works', { - - modelSettings <- setAdaBoost(nEstimators = list(10), - learningRate = list(0.1), +test_that("Sklearn predict works", { + modelSettings <- setAdaBoost( + nEstimators = list(10), + learningRate = list(0.1), ) - + plpModel <- fitPlp( - trainData = tinyTrainData, + trainData = tinyTrainData, modelSettings = modelSettings, - analysisId = 'Adaboost', + analysisId = "Adaboost", analysisPath = tempdir() ) - - predictions <- predictPythonSklearn(plpModel, - testData, - population) + + predictions <- predictPythonSklearn( + plpModel, + testData, + population + ) expect_correct_predictions(predictions, testData) }) diff --git a/tests/testthat/test-sklearnClassifierHelpers.R b/tests/testthat/test-sklearnClassifierHelpers.R index 7467fb73..0b2e698d 100644 --- a/tests/testthat/test-sklearnClassifierHelpers.R +++ b/tests/testthat/test-sklearnClassifierHelpers.R @@ -1,11 +1,9 @@ test_that("listCartesian works", { - - allList <- list(a=list(1,2), b=list(NULL, 'auto'), c=list(-1)) - + allList <- list(a = list(1, 2), b = list(NULL, "auto"), c = list(-1)) + paramLists <- listCartesian(allList) - - expect_equal(length(paramLists), 2*2*1) - expect_equal(names(paramLists[[1]]), c('a', 'b', 'c')) + + expect_equal(length(paramLists), 2 * 2 * 1) + expect_equal(names(paramLists[[1]]), c("a", "b", "c")) expect_equal(length(paramLists[[1]]), 3) - }) diff --git a/tests/testthat/test-sklearnClassifierSettings.R b/tests/testthat/test-sklearnClassifierSettings.R index 29c3f7f3..ab56a389 100644 --- a/tests/testthat/test-sklearnClassifierSettings.R +++ b/tests/testthat/test-sklearnClassifierSettings.R @@ -1,57 +1,52 @@ test_that("setAdaBoost settings work checks", { - adset <- setAdaBoost( - nEstimators = list(10,50, 200), - learningRate = list(1, 0.5, 0.1), - algorithm = list('SAMME.R'), - seed = sample(1000000,1) + nEstimators = list(10, 50, 200), + learningRate = list(1, 0.5, 0.1), + algorithm = list("SAMME"), + seed = sample(1000000, 1) ) - + expect_equal(adset$fitFunction, "fitSklearn") - - expect_equal(length(adset$param), 3*3*1) - - expect_equal(unique(unlist(lapply(adset$param, function(x) x[[1]]))), c(10,50, 200)) + + expect_equal(length(adset$param), 3 * 3 * 1) + + expect_equal(unique(unlist(lapply(adset$param, function(x) x[[1]]))), c(10, 50, 200)) expect_equal(unique(unlist(lapply(adset$param, function(x) x[[2]]))), c(1, 0.5, 0.1)) - expect_equal(unique(lapply(adset$param, function(x) x[[3]])), list('SAMME.R')) - - expect_false(attr(adset$param, 'settings')$requiresDenseMatrix) - expect_equal(attr(adset$param, 'settings')$name, 'AdaBoost') - expect_equal(attr(adset$param, 'settings')$pythonModule, 'sklearn.ensemble') - expect_equal(attr(adset$param, 'settings')$pythonClass, "AdaBoostClassifier") - - + expect_equal(unique(lapply(adset$param, function(x) x[[3]])), list("SAMME")) + + expect_false(attr(adset$param, "settings")$requiresDenseMatrix) + expect_equal(attr(adset$param, "settings")$name, "AdaBoost") + expect_equal(attr(adset$param, "settings")$pythonModule, "sklearn.ensemble") + expect_equal(attr(adset$param, "settings")$pythonClass, "AdaBoostClassifier") + + inputs <- AdaBoostClassifierInputs(list, adset$param[[1]]) expect_equal( - names(inputs), - c("n_estimators","learning_rate","algorithm","random_state" ) - ) - + names(inputs), + c("n_estimators", "learning_rate", "algorithm", "random_state") + ) }) test_that("setAdaBoost errors as expected", { - 
expect_error(setAdaBoost(nEstimators = list(-1))) - expect_error(setAdaBoost(learningRate = list(-1))) - expect_error(setAdaBoost(algorithm = list(-1))) - expect_error(setAdaBoost(seed = list('seed'))) - + expect_error(setAdaBoost(learningRate = list(-1))) + expect_error(setAdaBoost(algorithm = list(-1))) + expect_error(setAdaBoost(seed = list("seed"))) }) test_that("setMLP settings work checks", { - mlpset <- setMLP( - hiddenLayerSizes = list(c(100), c(20,4)), #must be integers - activation = list('relu'), - solver = list('adam'), - alpha = list(0.3,0.01,0.0001,0.000001), - batchSize = list('auto'), - learningRate = list('constant'), + hiddenLayerSizes = list(c(100), c(20, 4)), # must be integers + activation = list("relu"), + solver = list("adam"), + alpha = list(0.3, 0.01, 0.0001, 0.000001), + batchSize = list("auto"), + learningRate = list("constant"), learningRateInit = list(0.001), powerT = list(0.5), - maxIter = list(200, 100), + maxIter = list(200, 100), shuffle = list(TRUE), tol = list(0.0001), warmStart = list(TRUE), @@ -59,69 +54,67 @@ test_that("setMLP settings work checks", { nesterovsMomentum = list(TRUE), earlyStopping = list(FALSE), validationFraction = list(0.1), - beta1 = list(0.9), - beta2 = list(0.999), - epsilon = list(1,0.1,0.00000001), + beta1 = list(0.9), + beta2 = list(0.999), + epsilon = list(1, 0.1, 0.00000001), nIterNoChange = list(10), - seed = sample(100000,1) + seed = sample(100000, 1) ) - + expect_equal(mlpset$fitFunction, "fitSklearn") - - expect_equal(length(mlpset$param), 2*4*2*3) - - expect_equal(unique(lapply(mlpset$param, function(x) x[[1]])), list(c(100), c(20,4))) - expect_equal(unique(unlist(lapply(mlpset$param, function(x) x[[2]]))), 'relu') - expect_equal(unique(unlist(lapply(mlpset$param, function(x) x[[4]]))), c(0.3,0.01,0.0001,0.000001)) + + expect_equal(length(mlpset$param), 2 * 4 * 2 * 3) + + expect_equal(unique(lapply(mlpset$param, function(x) x[[1]])), list(c(100), c(20, 4))) + expect_equal(unique(unlist(lapply(mlpset$param, function(x) x[[2]]))), "relu") + expect_equal(unique(unlist(lapply(mlpset$param, function(x) x[[4]]))), c(0.3, 0.01, 0.0001, 0.000001)) expect_equal(unique(lapply(mlpset$param, function(x) x[[9]])), list(200, 100)) - - expect_false(attr(mlpset$param, 'settings')$requiresDenseMatrix) - expect_equal(attr(mlpset$param, 'settings')$name, 'Neural Network') - expect_equal(attr(mlpset$param, 'settings')$pythonModule, 'sklearn.neural_network') - expect_equal(attr(mlpset$param, 'settings')$pythonClass, "MLPClassifier") - + + expect_false(attr(mlpset$param, "settings")$requiresDenseMatrix) + expect_equal(attr(mlpset$param, "settings")$name, "Neural Network") + expect_equal(attr(mlpset$param, "settings")$pythonModule, "sklearn.neural_network") + expect_equal(attr(mlpset$param, "settings")$pythonClass, "MLPClassifier") + inputs <- MLPClassifierInputs(list, mlpset$param[[1]]) expect_equal( - names(inputs), - c("hidden_layer_sizes", "activation", "solver", "alpha", "batch_size", + names(inputs), + c( + "hidden_layer_sizes", "activation", "solver", "alpha", "batch_size", "learning_rate", "learning_rate_init", "power_t", "max_iter", "shuffle", "random_state", "tol", "verbose", "warm_start", "momentum", "nesterovs_momentum", "early_stopping", "validation_fraction", "beta_1", "beta_2", "epsilon", - "n_iter_no_change" ) + "n_iter_no_change" + ) ) }) test_that("setNaiveBayes settings work checks", { - - nbset <- setNaiveBayes( - ) - + nbset <- setNaiveBayes() + expect_equal(nbset$fitFunction, "fitSklearn") - + 
expect_equal(length(nbset$param), 1) - - expect_true(attr(nbset$param, 'settings')$requiresDenseMatrix) - expect_equal(attr(nbset$param, 'settings')$name, 'Naive Bayes') - expect_equal(attr(nbset$param, 'settings')$pythonModule, 'sklearn.naive_bayes') - expect_equal(attr(nbset$param, 'settings')$pythonClass, "GaussianNB") - + + expect_true(attr(nbset$param, "settings")$requiresDenseMatrix) + expect_equal(attr(nbset$param, "settings")$name, "Naive Bayes") + expect_equal(attr(nbset$param, "settings")$pythonModule, "sklearn.naive_bayes") + expect_equal(attr(nbset$param, "settings")$pythonClass, "GaussianNB") + inputs <- GaussianNBInputs(list, nbset$param[[1]]) - expect_equal(names(inputs),NULL) - + expect_equal(names(inputs), NULL) }) test_that("setRandomForest settings work checks", { - rfset <- setRandomForest( - ntrees = list(100,500), - criterion = list('gini'), - maxDepth = list(4,10,17), - minSamplesSplit = list(2,5), - minSamplesLeaf = list(1,10), + ntrees = list(100, 500), + criterion = list("gini"), + maxDepth = list(4, 10, 17), + minSamplesSplit = list(2, 5), + minSamplesLeaf = list(1, 10), minWeightFractionLeaf = list(0), - mtries = list('sqrt', 'log2'), + mtries = list("sqrt", "log2"), maxLeafNodes = list(NULL), minImpurityDecrease = list(0), bootstrap = list(TRUE), @@ -129,65 +122,68 @@ test_that("setRandomForest settings work checks", { oobScore = list(FALSE), nJobs = list(NULL), classWeight = list(NULL), - seed = sample(100000,1) + seed = sample(100000, 1) ) - + expect_equal(rfset$fitFunction, "fitSklearn") - - expect_equal(length(rfset$param), 2*3*2*2*2*2*1) - - expect_equal(unique(lapply(rfset$param, function(x) x[[1]])), list(100,500)) - expect_equal(unique(unlist(lapply(rfset$param, function(x) x[[3]]))), c(4,10,17)) - - expect_false(attr(rfset$param, 'settings')$requiresDenseMatrix) - expect_equal(attr(rfset$param, 'settings')$name, 'Random forest') - expect_equal(attr(rfset$param, 'settings')$pythonModule, 'sklearn.ensemble') - expect_equal(attr(rfset$param, 'settings')$pythonClass, "RandomForestClassifier") - - inputs <- RandomForestClassifierInputs(list, rfset$param[[1]]) + + expect_equal(length(rfset$param), 2 * 3 * 2 * 2 * 2 * 2 * 1) + + expect_equal(unique(lapply(rfset$param, function(x) x[[1]])), list(100, 500)) + expect_equal(unique(unlist(lapply(rfset$param, function(x) x[[3]]))), c(4, 10, 17)) + + expect_false(attr(rfset$param, "settings")$requiresDenseMatrix) + expect_equal(attr(rfset$param, "settings")$name, "Random forest") + expect_equal(attr(rfset$param, "settings")$pythonModule, "sklearn.ensemble") + expect_equal(attr(rfset$param, "settings")$pythonClass, "RandomForestClassifier") + + inputs <- RandomForestClassifierInputs(list, rfset$param[[1]]) expect_equal( - names(inputs), - c("n_estimators", "criterion", "max_depth", "min_samples_split", "min_samples_leaf", + names(inputs), + c( + "n_estimators", "criterion", "max_depth", "min_samples_split", "min_samples_leaf", "min_weight_fraction_leaf", "max_features", "max_leaf_nodes", "min_impurity_decrease", "bootstrap", "max_samples", "oob_score", "n_jobs", "random_state", "verbose", - "warm_start","class_weight") + "warm_start", "class_weight" + ) ) }) test_that("setSVM settings work checks", { - - svmset <- setSVM ( - C = list(1,0.9,2,0.1), - kernel = list('rbf'), - degree = list(1,3,5), - gamma = list('scale', 1e-04, 3e-05, 0.001, 0.01, 0.25), + svmset <- setSVM( + C = list(1, 0.9, 2, 0.1), + kernel = list("rbf"), + degree = list(1, 3, 5), + gamma = list("scale", 1e-04, 3e-05, 0.001, 0.01, 0.25), coef0 = 
list(0.0), - shrinking = list(TRUE), + shrinking = list(TRUE), tol = list(0.001), - classWeight = list(NULL), - cacheSize = 500, - seed = sample(100000,1) + classWeight = list(NULL), + cacheSize = 500, + seed = sample(100000, 1) ) - + expect_equal(svmset$fitFunction, "fitSklearn") - - expect_equal(length(svmset$param), 4*3*6*1) - - expect_equal(unique(lapply(svmset$param, function(x) x[[4]])), list('scale', 1e-04, 3e-05, 0.001, 0.01, 0.25)) - expect_equal(unique(unlist(lapply(svmset$param, function(x) x[[1]]))), c(1,0.9,2,0.1)) - - expect_false(attr(svmset$param, 'settings')$requiresDenseMatrix) - expect_equal(attr(svmset$param, 'settings')$name, 'Support Vector Machine') - expect_equal(attr(svmset$param, 'settings')$pythonModule, 'sklearn.svm') - expect_equal(attr(svmset$param, 'settings')$pythonClass, "SVC") - - inputs <- SVCInputs(list, svmset$param[[1]]) + + expect_equal(length(svmset$param), 4 * 3 * 6 * 1) + + expect_equal(unique(lapply(svmset$param, function(x) x[[4]])), list("scale", 1e-04, 3e-05, 0.001, 0.01, 0.25)) + expect_equal(unique(unlist(lapply(svmset$param, function(x) x[[1]]))), c(1, 0.9, 2, 0.1)) + + expect_false(attr(svmset$param, "settings")$requiresDenseMatrix) + expect_equal(attr(svmset$param, "settings")$name, "Support Vector Machine") + expect_equal(attr(svmset$param, "settings")$pythonModule, "sklearn.svm") + expect_equal(attr(svmset$param, "settings")$pythonClass, "SVC") + + inputs <- SVCInputs(list, svmset$param[[1]]) expect_equal( - names(inputs), - c("C", "kernel", "degree", "gamma", "coef0", - "shrinking", "probability", "tol", "cache_size", + names(inputs), + c( + "C", "kernel", "degree", "gamma", "coef0", + "shrinking", "probability", "tol", "cache_size", "class_weight", "verbose", "max_iter", "decision_function_shape", - "break_ties", "random_state") + "break_ties", "random_state" + ) ) })
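
A quick way to sanity-check the JSON serialization helpers above is to round-trip a fitted classifier through them. The sketch below is illustrative only and is not part of the package or its test suite: it assumes the package has been loaded with devtools::load_all() so the internal serializeDecisionTree()/deSerializeDecisionTree() helpers are visible, and that reticulate can find a Python environment with scikit-learn and numpy. It fits a small DecisionTreeClassifier, dumps the serialized dict to JSON and reads it back (mirroring the json$load path in sklearnFromJson), rebuilds the model, and checks that the predicted probabilities match.

# Hypothetical reviewer sketch, not package code: round-trip a decision tree
# through the (de)serialization helpers changed in this diff.
library(reticulate)

sklearn <- import("sklearn", convert = FALSE)
np <- import("numpy", convert = FALSE)
json <- import("json", convert = FALSE)
py <- import_builtins(convert = FALSE)

# Fit a small decision tree on toy data (100 rows, 5 features, binary outcome).
x <- np$array(matrix(stats::rnorm(500), nrow = 100))
y <- np$array(rep(0:1, 50))
original <- sklearn$tree$DecisionTreeClassifier(max_depth = 3L)
original$fit(x, y)

# Serialize to a plain dict, dump it to json, and read it back, so the
# deserializer sees the same json-loaded structures it gets in the package.
path <- file.path(tempdir(), "model.json")
outFile <- py$open(path, "w")
json$dump(serializeDecisionTree(original), outFile)
outFile$close()

inFile <- py$open(path, "r")
modelDict <- json$load(inFile)
inFile$close()

# Rebuild the classifier from the deserialized dict.
restored <- deSerializeDecisionTree(modelDict)

# The restored model should score the training data identically.
all.equal(
  py_to_r(original$predict_proba(x)),
  py_to_r(restored$predict_proba(x))
)

If the two probability matrices disagree, the structured-dtype reconstruction in deSerializeTree() (the names and formats passed to np$dtype()) is the first place to look, since that is the part most sensitive to the scikit-learn version used on each side of the round trip.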