From 766bccd674bde31a2dcc5e15d512844abfa80a31 Mon Sep 17 00:00:00 2001
From: b-zhou
Date: Fri, 26 Jul 2024 10:36:52 +0200
Subject: [PATCH 01/34] tests: LearnerRegrAuto

---
 tests/testthat/test_LearnerRegrAuto.R | 296 ++++++++++++++++++++++++++
 1 file changed, 296 insertions(+)
 create mode 100644 tests/testthat/test_LearnerRegrAuto.R

diff --git a/tests/testthat/test_LearnerRegrAuto.R b/tests/testthat/test_LearnerRegrAuto.R
new file mode 100644
index 0000000..2bcf1e6
--- /dev/null
+++ b/tests/testthat/test_LearnerRegrAuto.R
@@ -0,0 +1,296 @@
+test_that("glmnet works (regr)", {
+  rush_plan(n_workers = 2)
+  skip_if_not_installed("glmnet")
+
+  task = tsk("boston_housing")
+  learner = lrn("regr.auto",
+    learner_ids = "glmnet",
+    small_data_size = 1,
+    resampling = rsmp("holdout"),
+    measure = msr("regr.mse"),
+    terminator = trm("evals", n_evals = 6)
+  )
+
+  expect_class(learner$train(task), "LearnerRegrAuto")
+  expect_equal(learner$graph$param_set$values$branch.selection, "glmnet")
+  expect_equal(learner$model$instance$result$branch.selection, "glmnet")
+})
+
+test_that("kknn works (regr)", {
+  rush_plan(n_workers = 2)
+  skip_if_not_installed("kknn")
+
+  task = tsk("boston_housing")
+  learner = lrn("regr.auto",
+    learner_ids = "kknn",
+    small_data_size = 1,
+    resampling = rsmp("holdout"),
+    measure = msr("regr.mse"),
+    terminator = trm("evals", n_evals = 6)
+  )
+
+  expect_class(learner$train(task), "LearnerRegrAuto")
+  expect_equal(learner$graph$param_set$values$branch.selection, "kknn")
+  expect_equal(learner$model$instance$result$branch.selection, "kknn")
+})
+
+test_that("nnet works (regr)", {
+  rush_plan(n_workers = 2)
+  skip_if_not_installed("nnet")
+
+  task = tsk("boston_housing")
+  learner = lrn("regr.auto",
+    learner_ids = "nnet",
+    resampling = rsmp("holdout"),
+    small_data_size = 1,
+    measure = msr("regr.mse"),
+    terminator = trm("evals", n_evals = 6)
+  )
+
+  expect_class(learner$train(task), "LearnerRegrAuto")
+  expect_equal(learner$model$instance$result$branch.selection, "nnet")
+})
+
+test_that("ranger works (regr)", {
+  rush_plan(n_workers = 2)
+  skip_if_not_installed("ranger")
+
+
+  task = tsk("boston_housing")
+  learner = lrn("regr.auto",
+    learner_ids = "ranger",
+    small_data_size = 1,
+    resampling = rsmp("holdout"),
+    measure = msr("regr.mse"),
+    terminator = trm("evals", n_evals = 6)
+  )
+
+  expect_class(learner$train(task), "LearnerRegrAuto")
+  expect_equal(learner$model$instance$result$branch.selection, "ranger")
+})
+
+test_that("svm works (regr)", {
+  rush_plan(n_workers = 2)
+  skip_if_not_installed("e1071")
+
+
+  task = tsk("boston_housing")
+  learner = lrn("regr.auto",
+    learner_ids = "svm",
+    small_data_size = 1,
+    resampling = rsmp("holdout"),
+    measure = msr("regr.mse"),
+    terminator = trm("evals", n_evals = 6)
+  )
+
+  expect_class(learner$train(task), "LearnerRegrAuto")
+  expect_equal(learner$model$instance$result$branch.selection, "svm")
+})
+
+test_that("xgboost works (regr)", {
+  skip_if_not_installed("xgboost")
+  rush_plan(n_workers = 2)
+
+  task = tsk("boston_housing")
+  learner = lrn("regr.auto",
+    learner_ids = "xgboost",
+    small_data_size = 1,
+    # xgboost_eval_metric = "mlogloss",
+    resampling = rsmp("holdout"),
+    measure = msr("regr.mse"),
+    terminator = trm("evals", n_evals = 6)
+  )
+
+  expect_class(learner$train(task), "LearnerRegrAuto")
+  expect_equal(learner$model$instance$result$branch.selection, "xgboost")
+})
+
+test_that("catboost works (regr)", {
+  skip_if_not_installed("catboost")
+  rush_plan(n_workers = 2)
+
+  task = tsk("boston_housing")
+  learner = lrn("regr.auto",
+    learner_ids = "catboost",
+    small_data_size = 1,
+    # catboost_eval_metric = "MultiClass",
+    resampling = rsmp("holdout"),
+    measure = msr("regr.mse"),
+    terminator = trm("evals", n_evals = 6)
+  )
+
+  expect_class(learner$train(task), "LearnerRegrAuto")
+  expect_equal(learner$model$instance$result$branch.selection, "catboost")
+})
+
+test_that("only extra_trees fails", {
+  rush_plan(n_workers = 2)
+
+  task = tsk("boston_housing")
+  expect_error(lrn("regr.auto",
+    learner_ids = "extra_trees",
+    resampling = rsmp("holdout"),
+    measure = msr("regr.mse"),
+    terminator = trm("evals", n_evals = 6)
+  ), "must be combined with other learners")
+})
+
+test_that("extra_trees and glmnet works (regr)", {
+  skip_if_not_installed("glmnet")
+  rush_plan(n_workers = 2)
+
+  task = tsk("boston_housing")
+  learner = lrn("regr.auto",
+    learner_ids = c("extra_trees", "glmnet"),
+    resampling = rsmp("holdout"),
+    measure = msr("regr.mse"),
+    terminator = trm("evals", n_evals = 6)
+  )
+
+  expect_class(learner$train(task), "LearnerRegrAuto")
+  expect_equal(learner$model$instance$result$branch.selection, "extra_trees")
+})
+
+test_that("lightgbm works (regr)", {
+  skip_if_not_installed("lightgbm")
+  rush_plan(n_workers = 2)
+
+  task = tsk("boston_housing")
+  learner = lrn("regr.auto",
+    learner_ids = "lightgbm",
+    # lightgbm_eval_metric = "multi_logloss",
+    resampling = rsmp("holdout"),
+    measure = msr("regr.mse"),
+    terminator = trm("evals", n_evals = 6)
+  )
+
+  expect_class(learner$train(task), "LearnerRegrAuto")
+  expect_equal(learner$model$instance$result$branch.selection, "lightgbm")
+})
+
+test_that("xgboost, catboost and lightgbm work (regr)", {
+  skip_if_not_installed(c("xgboost", "catboost", "lightgbm"))
+  rush_plan(n_workers = 2)
+
+  task = tsk("boston_housing")
+  learner = lrn("regr.auto",
+    learner_ids = c("xgboost", "catboost", "lightgbm"),
+    catboost_eval_metric = "MultiClass",
+    lightgbm_eval_metric = "multi_logloss",
+    xgboost_eval_metric = "mlogloss",
+    resampling = rsmp("holdout"),
+    lhs_size = 1,
+    measure = msr("regr.mse"),
+    terminator = trm("evals", n_evals = 20),
+    callbacks = clbk("mlr3tuning.async_save_logs")
+  )
+
+  expect_class(learner$train(task), "LearnerRegrAuto")
+})
+
+test_that("all learners work (regr)", {
+  skip_if_not_installed(c("glmnet", "kknn", "nnet", "ranger", "e1071", "xgboost", "catboost", "MASS", "lightgbm"))
+  rush_plan(n_workers = 2)
+
+  task = tsk("boston_housing")
+  learner = lrn("regr.auto",
+    small_data_size = 100,
+    measure = msr("regr.mse"),
+    terminator = trm("evals", n_evals = 20),
+    lhs_size = 1
+  )
+
+  expect_class(learner$train(task), "LearnerRegrAuto")
+  expect_class(learner$model$instance, "TuningInstanceAsyncSingleCrit")
+  expect_prediction(learner$predict(task))
+})
+
+# test_that("memory limit works", {
+#   skip_if_not_installed(c("glmnet", "kknn", "nnet", "ranger", "e1071", "xgboost", "catboost", "MASS", "lightgbm"))
+#   rush_plan(n_workers = 2)
+
+#   task = tsk("spam")
+#   learner = lrn("regr.auto",
+#     max_memory = 50,
+#     small_data_size = 100,
+#     measure = msr("regr.mse"),
+#     terminator = trm("evals", n_evals = 20),
+#     resampling = rsmp("holdout"),
+#     lhs_size = 1
+#   )
+
+#   learner$train(task)
+# })
+
+test_that("small data set switch works (regr)", {
+  skip_if_not_installed(c("glmnet", "kknn", "nnet", "ranger", "e1071", "xgboost", "catboost", "lightgbm"))
+  rush_plan(n_workers = 2)
+
+  task = tsk("boston_housing")
+  learner = lrn("regr.auto",
+    small_data_size = 1000,
+    small_data_resampling = rsmp("cv", folds = 2),
+    measure = msr("regr.mse"),
+    terminator = trm("evals", n_evals = 1),
+    lhs_size = 1,
+    store_benchmark_result = TRUE
+  )
+
+  expect_class(learner$train(task), "LearnerRegrAuto")
+  expect_equal(learner$model$instance$archive$benchmark_result$resamplings$resampling[[1]]$iters, 2)
+})
+
+test_that("large data set switch works (regr)", {
+  skip_if_not_installed(c("glmnet", "kknn", "nnet", "ranger", "e1071", "xgboost", "catboost", "lightgbm"))
+  rush_plan(n_workers = 2)
+
+  task = tsk("boston_housing")
+  learner = lrn("regr.auto",
+    large_data_size = 100,
+    large_data_nthread = 4,
+    large_data_learner_ids = "ranger",
+    small_data_size = 100,
+    measure = msr("regr.mse"),
+    terminator = trm("evals", n_evals = 1),
+    lhs_size = 1,
+    store_benchmark_result = TRUE
+  )
+
+  expect_class(learner$train(task), "LearnerRegrAuto")
+  expect_set_equal(learner$model$instance$archive$data$branch.selection, "ranger")
+})
+
+test_that("max_cardinality works (regr)", {
+  skip_if_not_installed(c("glmnet", "kknn", "nnet", "ranger", "e1071", "xgboost", "catboost", "lightgbm"))
+  rush_plan(n_workers = 2)
+
+  task = tsk("boston_housing")
+  learner = lrn("regr.auto",
+    small_data_size = 1,
+    resampling = rsmp("holdout"),
+    max_cardinality = 2,
+    measure = msr("regr.mse"),
+    terminator = trm("evals", n_evals = 1),
+    lhs_size = 1
+  )
+
+  expect_class(learner$train(task), "LearnerRegrAuto")
+})
+
+test_that("max_cardinality works for extra trees (regr)", {
+  skip_if_not_installed(c("glmnet", "kknn", "nnet", "ranger", "e1071", "xgboost", "catboost", "lightgbm"))
+  rush_plan(n_workers = 2)
+
+  task = tsk("boston_housing")
+  learner = lrn("regr.auto",
+    small_data_size = 1,
+    resampling = rsmp("holdout"),
+    max_cardinality = 3,
+    extra_trees_max_cardinality = 2,
+    measure = msr("regr.mse"),
+    terminator = trm("evals", n_evals = 1),
+    lhs_size = 1
+  )
+
+  expect_class(learner$train(task), "LearnerRegrAuto")
+})

From 30de063b99bb5d07458e70f582ccc6f3feca0213 Mon Sep 17 00:00:00 2001
From: b-zhou
Date: Fri, 26 Jul 2024 10:48:41 +0200
Subject: [PATCH 02/34] feat: LearnerRegrAuto

---
 R/LearnerClassifAuto.R | 127 +-----------------------
 R/LearnerRegrAuto.R    | 214 ++++++++++++++++++++++++++++++++++++++++-
 R/helper.R             | 139 ++++++++++++++++++++++++++
 3 files changed, 353 insertions(+), 127 deletions(-)

diff --git a/R/LearnerClassifAuto.R b/R/LearnerClassifAuto.R
index b3b126e..132ab31 100644
--- a/R/LearnerClassifAuto.R
+++ b/R/LearnerClassifAuto.R
@@ -89,7 +89,7 @@ LearnerClassifAuto = R6Class("LearnerClassifAuto",
     .train = function(task) {
       pv = self$param_set$values
       learner_ids = pv$learner_ids
-      self$graph = build_graph(learner_ids)
+      self$graph = build_graph(learner_ids, task_type = "classif")
       self$tuning_space = tuning_space[learner_ids]
 
       lg$debug("Training '%s' on task '%s'", self$id, task$id)
@@ -252,131 +252,6 @@ LearnerClassifAuto = R6Class("LearnerClassifAuto",
 #' @include aaa.R
 learners[["classif.auto"]] = LearnerClassifAuto
 
-build_graph = function(learner_ids) {
-  branches = list()
-  # glmnet
-  if ("glmnet" %in% learner_ids) {
-    branch_glmnet = po("removeconstants", id = "glmnet_removeconstants") %>>%
-      po("imputehist", id = "glmnet_imputehist") %>>%
-      po("imputeoor", id = "glmnet_imputeoor") %>>%
-      po("fixfactors", id = "glmnet_fixfactors") %>>%
-      po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "glmnet_imputesample") %>>%
-      po("collapsefactors", target_level_count = 100, id = "glmnet_collapse") %>>%
-      po("encode", method = "one-hot", id = "glmnet_encode") %>>%
-      po("removeconstants", id = "glmnet_post_removeconstants") %>>%
-      lrn("classif.glmnet", id = "glmnet")
-    branches = c(branches, branch_glmnet)
-  }
-
-  # kknn
-  if ("kknn" %in% learner_ids) {
-    branch_kknn = po("removeconstants", id = "kknn_removeconstants") %>>%
-      po("imputehist", id = "kknn_imputehist") %>>%
-      po("imputeoor", id = "kknn_imputeoor") %>>%
-      po("fixfactors", id = "kknn_fixfactors") %>>%
-      po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "kknn_imputesample") %>>%
-      po("collapsefactors", target_level_count = 100, id = "kknn_collapse") %>>%
-      po("removeconstants", id = "kknn_post_removeconstants") %>>%
-      lrn("classif.kknn", id = "kknn")
-    branches = c(branches, branch_kknn)
-  }
-
-  # lda
-  if ("lda" %in% learner_ids) {
-    branch_lda = po("removeconstants", id = "lda_removeconstants") %>>%
-      po("imputehist", id = "lda_imputehist") %>>%
-      po("imputeoor", id = "lda_imputeoor") %>>%
-      po("fixfactors", id = "lda_fixfactors") %>>%
-      po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "lda_imputesample") %>>%
-      po("collapsefactors", target_level_count = 100, id = "lda_collapse") %>>%
-      po("removeconstants", id = "lda_post_removeconstants") %>>%
-      lrn("classif.lda", id = "lda")
-    branches = c(branches, branch_lda)
-  }
-
-  # nnet
-  if ("nnet" %in% learner_ids) {
-    branch_nnet = po("removeconstants", id = "nnet_removeconstants") %>>%
-      po("imputehist", id = "nnet_imputehist") %>>%
-      po("imputeoor", id = "nnet_imputeoor") %>>%
-      po("fixfactors", id = "nnet_fixfactors") %>>%
-      po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "nnet_imputesample") %>>%
-      po("collapsefactors", target_level_count = 100, id = "nnet_collapse") %>>%
-      po("removeconstants", id = "nnet_post_removeconstants") %>>%
-      lrn("classif.nnet", id = "nnet")
-    branches = c(branches, branch_nnet)
-  }
-
-  # ranger
-  if ("ranger" %in% learner_ids) {
-    branch_ranger = po("removeconstants", id = "ranger_removeconstants") %>>%
-      po("imputeoor", id = "ranger_imputeoor") %>>%
-      po("fixfactors", id = "ranger_fixfactors") %>>%
-      po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "ranger_imputesample") %>>%
-      po("collapsefactors", target_level_count = 100, id = "ranger_collapse") %>>%
-      po("removeconstants", id = "ranger_post_removeconstants") %>>%
-      # use upper bound of search space for memory estimation
-      lrn("classif.ranger", id = "ranger", num.trees = 2000)
-    branches = c(branches, branch_ranger)
-  }
-
-  # svm
-  if ("svm" %in% learner_ids) {
-    branch_svm = po("removeconstants", id = "svm_removeconstants") %>>%
-      po("imputehist", id = "svm_imputehist") %>>%
-      po("imputeoor", id = "svm_imputeoor") %>>%
-      po("fixfactors", id = "svm_fixfactors") %>>%
-      po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "svm_imputesample") %>>%
-      po("collapsefactors", target_level_count = 100, id = "svm_collapse") %>>%
-      po("encode", method = "one-hot", id = "svm_encode") %>>%
-      po("removeconstants", id = "svm_post_removeconstants") %>>%
-      lrn("classif.svm", id = "svm", type = "C-classification")
-    branches = c(branches, branch_svm)
-  }
-
-  # xgboost
-  if ("xgboost" %in% learner_ids) {
-    branch_xgboost = po("removeconstants", id = "xgboost_removeconstants") %>>%
-      po("imputeoor", id = "xgboost_imputeoor") %>>%
-      po("fixfactors", id = "xgboost_fixfactors") %>>%
-      po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "xgboost_imputesample") %>>%
-      po("encodeimpact", id = "xgboost_encode") %>>%
po("removeconstants", id = "xgboost_post_removeconstants") %>>% - lrn("classif.xgboost", id = "xgboost", nrounds = 5000, early_stopping_rounds = 10) - branches = c(branches, branch_xgboost) - } - - # catboost - if ("catboost" %in% learner_ids) { - branch_catboost = po("colapply", id = "catboost_colapply", applicator = as.numeric, affect_columns = selector_type("integer")) %>>% - lrn("classif.catboost", id = "catboost", iterations = 500, early_stopping_rounds = 10, use_best_model = TRUE) - branches = c(branches, branch_catboost) - } - - # extra trees - if ("extra_trees" %in% learner_ids) { - branch_extra_trees = po("removeconstants", id = "extra_trees_removeconstants") %>>% - po("imputeoor", id = "extra_trees_imputeoor") %>>% - po("fixfactors", id = "extra_trees_fixfactors") %>>% - po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "extra_trees_imputesample") %>>% - po("collapsefactors", target_level_count = 40, id = "extra_trees_collapse") %>>% - po("removeconstants", id = "extra_trees_post_removeconstants") %>>% - lrn("classif.ranger", id = "extra_trees", splitrule = "extratrees", num.trees = 100, replace = FALSE, sample.fraction = 1) - branches = c(branches, branch_extra_trees) - } - - # lightgbm - if ("lightgbm" %in% learner_ids) { - branch_lightgbm = lrn("classif.lightgbm", id = "lightgbm", num_iterations = 5000, early_stopping_rounds = 10) - branches = c(branches, branch_lightgbm) - } - - # branch graph - po("branch", options = learner_ids) %>>% - gunion(branches) %>>% - po("unbranch", options = learner_ids) -} - tuning_space = list( glmnet = list( glmnet.s = to_tune(1e-4, 1e4, logscale = TRUE), diff --git a/R/LearnerRegrAuto.R b/R/LearnerRegrAuto.R index bee6e86..9db316b 100644 --- a/R/LearnerRegrAuto.R +++ b/R/LearnerRegrAuto.R @@ -23,16 +23,228 @@ LearnerRegrAuto = R6Class("LearnerRegrAuto", #' @description #' Creates a new instance of this [R6][R6::R6Class] class. 
initialize = function(id = "classif.auto") { + param_set = ps( + # learner + learner_ids = p_uty(default = c("glmnet", "kknn", "lda", "nnet", "ranger", "svm", "xgboost", "catboost", "extra_trees", "lightgbm"), + custom_check = function(x) { + if (all(x %in% c("lda", "extra_trees"))) { + return("Learner 'lda' and 'extra_trees' must be combined with other learners") + } + check_subset(x, c("glmnet", "kknn", "lda", "nnet", "ranger", "svm", "xgboost", "catboost", "extra_trees", "lightgbm")) + }), + learner_timeout = p_int(lower = 1L, default = 900L), + xgboost_eval_metric = p_uty(), + catboost_eval_metric = p_uty(), + lightgbm_eval_metric = p_uty(), + # system + max_nthread = p_int(lower = 1L, default = 1L), + max_memory = p_int(lower = 1L, default = 32000L), + # large data + large_data_size = p_int(lower = 1L, default = 1e6), + large_data_learner_ids = p_uty(), + large_data_nthread = p_int(lower = 1L, default = 4L), + # small data + small_data_size = p_int(lower = 1L, default = 5000L), + small_data_resampling = p_uty(), + max_cardinality = p_int(lower = 1L, default = 100L), + extra_trees_max_cardinality = p_int(lower = 1L, default = 40L), + # tuner + resampling = p_uty(), + terminator = p_uty(), + measure = p_uty(), + lhs_size = p_int(lower = 1L, default = 4L), + callbacks = p_uty(), + store_benchmark_result = p_lgl(default = FALSE)) + param_set$set_values( + learner_ids = c("glmnet", "kknn", "nnet", "ranger", "svm", "xgboost", "catboost", "extra_trees", "lightgbm"), + learner_timeout = 900L, + max_nthread = 1L, + max_memory = 32000L, + large_data_size = 1e6L, + large_data_learner_ids = c("ranger", "xgboost", "catboost", "extra_trees", "lightgbm"), + large_data_nthread = 4L, + small_data_size = 5000L, + small_data_resampling = rsmp("cv", folds = 10L), + max_cardinality = 100L, + extra_trees_max_cardinality = 40L, + resampling = rsmp("cv", folds = 3L), + terminator = trm("run_time", secs = 14400L), + measure = msr("regr.mse"), + lhs_size = 4L, + store_benchmark_result = FALSE) + + super$initialize( + id = id, + task_type = "regr", + param_set = param_set, + packages = c("mlr3tuning", "mlr3learners", "mlr3pipelines", "mlr3mbo", "mlr3automl", "xgboost", "catboost", "lightgbm", "ranger", "nnet", "kknn", "glmnet", "e1071"), + feature_types = c("logical", "integer", "numeric", "character", "factor"), + predict_types = "response", + properties = c("missings", "weights") + ) } ), private = list( .train = function(task) { + pv = self$param_set$values + learner_ids = pv$learner_ids + self$graph = build_graph(learner_ids, task_type = "regr") + self$tuning_space = tuning_space[learner_ids] + + lg$debug("Training '%s' on task '%s'", self$id, task$id) + + # initialize mbo tuner + tuner = tnr("adbo") + + # remove learner based on memory limit + lg$debug("Starting to select from %i learners: %s", length(learner_ids), paste0(learner_ids, collapse = ",")) + + if (!is.null(pv$max_memory)) { + memory_usage = map_dbl(learner_ids, function(learner_id) { + self$graph$pipeops[[learner_id]]$learner$estimate_memory_usage(task) / 1e6 + }) + learner_ids = learner_ids[memory_usage < pv$max_memory] + lg$debug("Checking learners for memory limit of %i MB. 
Keeping %i learner(s): %s", pv$max_memory, length(learner_ids), paste0(learner_ids, collapse = ",")) + } + + # set number of threads + if (!is.null(pv$max_nthread)) { + lg$debug("Setting number of threads per learner to %i", pv$max_nthread) + walk(learner_ids, function(learner_id) { + set_threads(self$graph$pipeops[[learner_id]]$learner, pv$max_nthread) + }) + } + + # reduce number of workers on large data sets + if (!is.null(pv$large_data_size) && task$nrow * task$ncol > pv$large_data_size) { + lg$debug("Task size larger than %i rows", pv$large_data_size) + + learner_ids = intersect(learner_ids, pv$large_data_learner_ids) + self$tuning_space = tuning_space[learner_ids] + lg$debug("Keeping %i learner(s): %s", length(learner_ids), paste0(learner_ids, collapse = ",")) + + lg$debug("Increasing number of threads per learner to %i", pv$large_data_nthread) + walk(learner_ids, function(learner_id) { + set_threads(self$graph$pipeops[[learner_id]]$learner, pv$large_data_nthread) + }) + n_workers = rush_config()$n_workers + n = max(1, floor(n_workers / pv$large_data_nthread)) + tuner$param_set$set_values(n_workers = n) + lg$debug("Reducing number of workers to %i", n) + } + + # small data resampling + resampling = if (!is.null(pv$small_data_size) && task$nrow < pv$small_data_size) { + lg$debug("Task has less than %i rows", pv$small_data_size) + lg$debug("Using small data set resampling with %i iterations", pv$small_data_resampling$iters) + pv$small_data_resampling + } else { + pv$resampling + } + + # cardinality + cardinality = map_int(task$col_info$levels, length) + if (!is.null(pv$max_cardinality) && any(cardinality > pv$max_cardinality)) { + lg$debug("Reducing number of factor levels to %i", pv$max_cardinality) + # collapse factors + pipeop_ids = names(self$graph$pipeops) + pipeop_ids = pipeop_ids[grep("collapse", pipeop_ids)] + walk(pipeop_ids, function(pipeop_id) { + self$graph$pipeops[[pipeop_id]]$param_set$values$target_level_count = pv$max_cardinality + }) + } + + if ("extra_trees" %in% learner_ids && any(cardinality > pv$extra_trees_max_cardinality)) { + lg$debug("Reducing number of factor levels to %i for extra trees", pv$extra_trees_max_cardinality) + self$graph$pipeops$extra_trees_collapse$param_set$values$target_level_count = pv$extra_trees_max_cardinality + } + + # initialize graph learner + graph_learner = as_learner(self$graph) + graph_learner$id = "graph_learner" + graph_learner$predict_type = pv$measure$predict_type + graph_learner$fallback = lrn("regr.featureless", predict_type = pv$measure$predict_type) + graph_learner$encapsulate = c(train = "callr", predict = "callr") + graph_learner$timeout = c(train = pv$learner_timeout, predict = pv$learner_timeout) + + learners_with_validation = intersect(learner_ids, c("xgboost", "catboost", "lightgbm")) + if (length(learners_with_validation)) { + set_validate(graph_learner, "test", ids = learners_with_validation) + } + + # set early stopping + if ("xgboost" %in% learner_ids) { + graph_learner$param_set$values$xgboost.callbacks = list(cb_timeout_xgboost(pv$learner_timeout * 0.8)) + graph_learner$param_set$values$xgboost.eval_metric = pv$xgboost_eval_metric + } + if ("catboost" %in% learner_ids) { + graph_learner$param_set$values$catboost.eval_metric = pv$catboost_eval_metric + } + if ("lightgbm" %in% learner_ids) { + graph_learner$param_set$values$lightgbm.callbacks = list(cb_timeout_lightgbm(pv$learner_timeout * 0.8)) + graph_learner$param_set$values$lightgbm.eval = pv$lightgbm_eval_metric + } + + # initialize search space + tuning_space 
= unlist(unname(self$tuning_space), recursive = FALSE) + graph_scratch = graph_learner$clone(deep = TRUE) + graph_scratch$param_set$set_values(.values = tuning_space) + graph_scratch$param_set$set_values(branch.selection = to_tune(learner_ids)) + search_space = graph_scratch$param_set$search_space() + walk(learner_ids, function(learner_id) { + param_ids = search_space$ids() + param_ids = grep(paste0("^", learner_id), param_ids, value = TRUE) + walk(param_ids, function(param_id) { + # skip internal tuning parameter + if (param_id %in% c("xgboost.nrounds", "catboost.iterations", "lightgbm.num_iterations")) return() + search_space$add_dep( + id = param_id, + on = "branch.selection", + cond = CondEqual$new(learner_id) + ) + }) + }) + + # initial design + lhs_xdt = generate_lhs_design(pv$lhs_size, self$task_type, setdiff(learner_ids, c("lda", "extra_trees")), self$tuning_space) + default_xdt = generate_default_design(self$task_type, learner_ids, task, self$tuning_space) + initial_xdt = rbindlist(list(lhs_xdt, default_xdt), use.names = TRUE, fill = TRUE) + setorderv(initial_xdt, "branch.selection") + tuner$param_set$set_values(initial_design = initial_xdt) + + # initialize auto tuner + self$instance = ti_async( + task = task, + learner = graph_learner, + resampling = resampling, + measures = pv$measure, + terminator = pv$terminator, + search_space = search_space, + callbacks = pv$callbacks, + store_benchmark_result = pv$store_benchmark_result + ) + + # tune + lg$debug("Learner '%s' starts tuning phase", self$id) + tuner$optimize(self$instance) + + # fit final model + lg$debug("Learner '%s' fits final model", self$id) + if (length(learners_with_validation)) { + set_validate(graph_learner, NULL, ids = intersect(learner_ids, c("xgboost", "catboost", "lightgbm"))) + } + graph_learner$param_set$set_values(.values = self$instance$result_learner_param_vals, .insert = FALSE) + graph_learner$timeout = c(train = Inf, predict = Inf) + graph_learner$train(task) + + list(graph_learner = graph_learner, instance = self$instance) }, .predict = function(task) { - + lg$debug("Predicting with '%s' on task '%s'", self$id, task$id) + self$model$graph_learner$predict(task) } ) ) diff --git a/R/helper.R b/R/helper.R index 4b27c1c..8d4ff40 100644 --- a/R/helper.R +++ b/R/helper.R @@ -1,3 +1,142 @@ +build_graph = function(learner_ids, task_type) { + assert_choice(task_type, c("classif", "regr")) + learners_reg = c("glmnet", "kknn", "nnet", "ranger", "svm", "xgboost", "catboost", "extra_trees", "lightgbm") + if (task_type == "regr") { + assert_subset(learner_ids, learners_reg) + } else { + assert_subset(learner_ids, c(learners_reg, "lda")) + } + + branches = list() + # glmnet + if ("glmnet" %in% learner_ids) { + branch_glmnet = po("removeconstants", id = "glmnet_removeconstants") %>>% + po("imputehist", id = "glmnet_imputehist") %>>% + po("imputeoor", id = "glmnet_imputeoor") %>>% + po("fixfactors", id = "glmnet_fixfactors") %>>% + po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "glmnet_imputesample") %>>% + po("collapsefactors", target_level_count = 100, id = "glmnet_collapse") %>>% + po("encode", method = "one-hot", id = "glmnet_encode") %>>% + po("removeconstants", id = "glmnet_post_removeconstants") %>>% + lrn(paste0(task_type, ".glmnet"), id = "glmnet") + branches = c(branches, branch_glmnet) + } + + # kknn + if ("kknn" %in% learner_ids) { + branch_kknn = po("removeconstants", id = "kknn_removeconstants") %>>% + po("imputehist", id = "kknn_imputehist") %>>% + po("imputeoor", id = 
"kknn_imputeoor") %>>% + po("fixfactors", id = "kknn_fixfactors") %>>% + po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "kknn_imputesample") %>>% + po("collapsefactors", target_level_count = 100, id = "kknn_collapse") %>>% + po("removeconstants", id = "kknn_post_removeconstants") %>>% + lrn(paste0(task_type, ".kknn"), id = "kknn") + branches = c(branches, branch_kknn) + } + + # lda + # only for classification + if ("lda" %in% learner_ids) { + branch_lda = po("removeconstants", id = "lda_removeconstants") %>>% + po("imputehist", id = "lda_imputehist") %>>% + po("imputeoor", id = "lda_imputeoor") %>>% + po("fixfactors", id = "lda_fixfactors") %>>% + po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "lda_imputesample") %>>% + po("collapsefactors", target_level_count = 100, id = "lda_collapse") %>>% + po("removeconstants", id = "lda_post_removeconstants") %>>% + lrn("classif.lda", id = "lda") + branches = c(branches, branch_lda) + } + + # nnet + if ("nnet" %in% learner_ids) { + branch_nnet = po("removeconstants", id = "nnet_removeconstants") %>>% + po("imputehist", id = "nnet_imputehist") %>>% + po("imputeoor", id = "nnet_imputeoor") %>>% + po("fixfactors", id = "nnet_fixfactors") %>>% + po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "nnet_imputesample") %>>% + po("collapsefactors", target_level_count = 100, id = "nnet_collapse") %>>% + po("removeconstants", id = "nnet_post_removeconstants") %>>% + lrn(paste0(task_type, ".nnet"), id = "nnet") + branches = c(branches, branch_nnet) + } + + # ranger + if ("ranger" %in% learner_ids) { + branch_ranger = po("removeconstants", id = "ranger_removeconstants") %>>% + po("imputeoor", id = "ranger_imputeoor") %>>% + po("fixfactors", id = "ranger_fixfactors") %>>% + po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "ranger_imputesample") %>>% + po("collapsefactors", target_level_count = 100, id = "ranger_collapse") %>>% + po("removeconstants", id = "ranger_post_removeconstants") %>>% + # use upper bound of search space for memory estimation + lrn(paste0(task_type, ".ranger"), id = "ranger", num.trees = 2000) + branches = c(branches, branch_ranger) + } + + # svm + if ("svm" %in% learner_ids) { + svm_type = if (task_type == "classif") { + "C-classification" + } else { + "eps-regression" + } + branch_svm = po("removeconstants", id = "svm_removeconstants") %>>% + po("imputehist", id = "svm_imputehist") %>>% + po("imputeoor", id = "svm_imputeoor") %>>% + po("fixfactors", id = "svm_fixfactors") %>>% + po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "svm_imputesample") %>>% + po("collapsefactors", target_level_count = 100, id = "svm_collapse") %>>% + po("encode", method = "one-hot", id = "svm_encode") %>>% + po("removeconstants", id = "svm_post_removeconstants") %>>% + lrn(paste0(task_type, ".svm"), id = "svm", type = svm_type) + branches = c(branches, branch_svm) + } + + # xgboost + if ("xgboost" %in% learner_ids) { + branch_xgboost = po("removeconstants", id = "xgboost_removeconstants") %>>% + po("imputeoor", id = "xgboost_imputeoor") %>>% + po("fixfactors", id = "xgboost_fixfactors") %>>% + po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "xgboost_imputesample") %>>% + po("encodeimpact", id = "xgboost_encode") %>>% + po("removeconstants", id = "xgboost_post_removeconstants") %>>% + lrn(paste0(task_type, ".xgboost"), id = "xgboost", nrounds = 5000, early_stopping_rounds = 10) 
+    branches = c(branches, branch_xgboost)
+  }
+
+  # catboost
+  if ("catboost" %in% learner_ids) {
+    branch_catboost = po("colapply", id = "catboost_colapply", applicator = as.numeric, affect_columns = selector_type("integer")) %>>%
+      lrn(paste0(task_type, ".catboost"), id = "catboost", iterations = 500, early_stopping_rounds = 10, use_best_model = TRUE)
+    branches = c(branches, branch_catboost)
+  }
+
+  # extra trees
+  if ("extra_trees" %in% learner_ids) {
+    branch_extra_trees = po("removeconstants", id = "extra_trees_removeconstants") %>>%
+      po("imputeoor", id = "extra_trees_imputeoor") %>>%
+      po("fixfactors", id = "extra_trees_fixfactors") %>>%
+      po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "extra_trees_imputesample") %>>%
+      po("collapsefactors", target_level_count = 40, id = "extra_trees_collapse") %>>%
+      po("removeconstants", id = "extra_trees_post_removeconstants") %>>%
+      lrn(paste0(task_type, ".ranger"), id = "extra_trees", splitrule = "extratrees", num.trees = 100, replace = FALSE, sample.fraction = 1)
+    branches = c(branches, branch_extra_trees)
+  }
+
+  # lightgbm
+  if ("lightgbm" %in% learner_ids) {
+    branch_lightgbm = lrn(paste0(task_type, ".lightgbm"), id = "lightgbm", num_iterations = 5000, early_stopping_rounds = 10)
+    branches = c(branches, branch_lightgbm)
+  }
+
+  # branch graph
+  po("branch", options = learner_ids) %>>%
+    gunion(branches) %>>%
+    po("unbranch", options = learner_ids)
+}
+
 generate_default_design = function(task_type, learner_ids, task, tuning_space, branch = TRUE) {
   map_dtr(learner_ids, function(learner_id) {
     if (paste0(task_type, ".", learner_id) %nin% mlr_learners$keys()) {

From 2c99b98d2051ce4fbb27257a772f644a8d0ffa70 Mon Sep 17 00:00:00 2001
From: b-zhou
Date: Fri, 26 Jul 2024 11:02:45 +0200
Subject: [PATCH 03/34] test: extra trees and eval metrics

---
 tests/testthat/test_LearnerRegrAuto.R | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/testthat/test_LearnerRegrAuto.R b/tests/testthat/test_LearnerRegrAuto.R
index 2bcf1e6..4b274cc 100644
--- a/tests/testthat/test_LearnerRegrAuto.R
+++ b/tests/testthat/test_LearnerRegrAuto.R
@@ -148,7 +148,7 @@ test_that("extra_trees and glmnet works (regr)", {
   )
 
   expect_class(learner$train(task), "LearnerRegrAuto")
-  expect_equal(learner$model$instance$result$branch.selection, "extra_trees")
+  expect_choice(learner$model$instance$result$branch.selection, c("extra_trees", "glmnet"))
 })
 
 test_that("lightgbm works (regr)", {
@@ -175,9 +175,9 @@ test_that("xgboost, catboost and lightgbm work (regr)", {
   task = tsk("boston_housing")
   learner = lrn("regr.auto",
     learner_ids = c("xgboost", "catboost", "lightgbm"),
-    catboost_eval_metric = "MultiClass",
-    lightgbm_eval_metric = "multi_logloss",
-    xgboost_eval_metric = "mlogloss",
+    # catboost_eval_metric = "MultiClass",
+    # lightgbm_eval_metric = "multi_logloss",
+    # xgboost_eval_metric = "mlogloss",
     resampling = rsmp("holdout"),
     lhs_size = 1,
     measure = msr("regr.mse"),

From 4bb4d0c716da7398985283d61fb209c6fa07c057 Mon Sep 17 00:00:00 2001
From: b-zhou
Date: Fri, 26 Jul 2024 11:14:38 +0200
Subject: [PATCH 04/34] fix: remove lda

---
 R/LearnerRegrAuto.R | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/R/LearnerRegrAuto.R b/R/LearnerRegrAuto.R
index 9db316b..20ccf50 100644
--- a/R/LearnerRegrAuto.R
+++ b/R/LearnerRegrAuto.R
@@ -25,12 +25,12 @@ LearnerRegrAuto = R6Class("LearnerRegrAuto",
     initialize = function(id = "regr.auto") {
       param_set = ps(
         # learner
-        learner_ids = p_uty(default = c("glmnet", "kknn", "lda", "nnet", "ranger", "svm", "xgboost", "catboost", "extra_trees", "lightgbm"),
+        learner_ids = p_uty(default = c("glmnet", "kknn", "nnet", "ranger", "svm", "xgboost", "catboost", "extra_trees", "lightgbm"),
           custom_check = function(x) {
-            if (all(x %in% c("lda", "extra_trees"))) {
-              return("Learner 'lda' and 'extra_trees' must be combined with other learners")
+            if (length(x) == 1 && x == "extra_trees") {
+              return("Learner 'extra_trees' must be combined with other learners")
             }
-            check_subset(x, c("glmnet", "kknn", "lda", "nnet", "ranger", "svm", "xgboost", "catboost", "extra_trees", "lightgbm"))
+            check_subset(x, c("glmnet", "kknn", "nnet", "ranger", "svm", "xgboost", "catboost", "extra_trees", "lightgbm"))
           }),
         learner_timeout = p_int(lower = 1L, default = 900L),
         xgboost_eval_metric = p_uty(),
@@ -208,7 +208,7 @@ LearnerRegrAuto = R6Class("LearnerRegrAuto",
       })
 
       # initial design
-      lhs_xdt = generate_lhs_design(pv$lhs_size, self$task_type, setdiff(learner_ids, c("lda", "extra_trees")), self$tuning_space)
+      lhs_xdt = generate_lhs_design(pv$lhs_size, self$task_type, setdiff(learner_ids, "extra_trees"), self$tuning_space)
       default_xdt = generate_default_design(self$task_type, learner_ids, task, self$tuning_space)
       initial_xdt = rbindlist(list(lhs_xdt, default_xdt), use.names = TRUE, fill = TRUE)
       setorderv(initial_xdt, "branch.selection")

From 823bced553f0f2e63211518b4d1660e83995745b Mon Sep 17 00:00:00 2001
From: b-zhou
Date: Fri, 26 Jul 2024 12:15:56 +0200
Subject: [PATCH 05/34] refactor: build_graph

---
 R/build_graph.R | 202 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 202 insertions(+)
 create mode 100644 R/build_graph.R

diff --git a/R/build_graph.R b/R/build_graph.R
new file mode 100644
index 0000000..c863ba6
--- /dev/null
+++ b/R/build_graph.R
@@ -0,0 +1,202 @@
+build_graph = function(learner_ids, task_type) {
+  assert_choice(task_type, c("classif", "regr"))
+  learners_reg = c("glmnet", "kknn", "nnet", "ranger", "svm", "xgboost", "catboost", "extra_trees", "lightgbm")
+  if (task_type == "regr") {
+    assert_subset(learner_ids, learners_reg)
+  } else {
+    assert_subset(learner_ids, c(learners_reg, "lda"))
+  }
+
+  branches = list()
+  # glmnet
+  if ("glmnet" %in% learner_ids) {
+    branch_glmnet = po("removeconstants", id = "glmnet_removeconstants") %>>%
+      po("imputehist", id = "glmnet_imputehist") %>>%
+      po("imputeoor", id = "glmnet_imputeoor") %>>%
+      po("fixfactors", id = "glmnet_fixfactors") %>>%
+      po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "glmnet_imputesample") %>>%
+      po("collapsefactors", target_level_count = 100, id = "glmnet_collapse") %>>%
+      po("encode", method = "one-hot", id = "glmnet_encode") %>>%
+      po("removeconstants", id = "glmnet_post_removeconstants") %>>%
+      lrn(paste0(task_type, ".glmnet"), id = "glmnet")
+    branches = c(branches, branch_glmnet)
+  }
+
+  # kknn
+  if ("kknn" %in% learner_ids) {
+    branch_kknn = po("removeconstants", id = "kknn_removeconstants") %>>%
+      po("imputehist", id = "kknn_imputehist") %>>%
+      po("imputeoor", id = "kknn_imputeoor") %>>%
+      po("fixfactors", id = "kknn_fixfactors") %>>%
+      po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "kknn_imputesample") %>>%
+      po("collapsefactors", target_level_count = 100, id = "kknn_collapse") %>>%
+      po("removeconstants", id = "kknn_post_removeconstants") %>>%
+      lrn(paste0(task_type, ".kknn"), id = "kknn")
+    branches = c(branches, branch_kknn)
+  }
+
+  # lda
+  # only for classification
+  if ("lda" %in% learner_ids) {
branch_lda = po("removeconstants", id = "lda_removeconstants") %>>% + po("imputehist", id = "lda_imputehist") %>>% + po("imputeoor", id = "lda_imputeoor") %>>% + po("fixfactors", id = "lda_fixfactors") %>>% + po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "lda_imputesample") %>>% + po("collapsefactors", target_level_count = 100, id = "lda_collapse") %>>% + po("removeconstants", id = "lda_post_removeconstants") %>>% + lrn("classif.lda", id = "lda") + branches = c(branches, branch_lda) + } + + # nnet + if ("nnet" %in% learner_ids) { + branch_nnet = po("removeconstants", id = "nnet_removeconstants") %>>% + po("imputehist", id = "nnet_imputehist") %>>% + po("imputeoor", id = "nnet_imputeoor") %>>% + po("fixfactors", id = "nnet_fixfactors") %>>% + po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "nnet_imputesample") %>>% + po("collapsefactors", target_level_count = 100, id = "nnet_collapse") %>>% + po("removeconstants", id = "nnet_post_removeconstants") %>>% + lrn(paste0(task_type, ".nnet"), id = "nnet") + branches = c(branches, branch_nnet) + } + + # ranger + if ("ranger" %in% learner_ids) { + branch_ranger = po("removeconstants", id = "ranger_removeconstants") %>>% + po("imputeoor", id = "ranger_imputeoor") %>>% + po("fixfactors", id = "ranger_fixfactors") %>>% + po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "ranger_imputesample") %>>% + po("collapsefactors", target_level_count = 100, id = "ranger_collapse") %>>% + po("removeconstants", id = "ranger_post_removeconstants") %>>% + # use upper bound of search space for memory estimation + lrn(paste0(task_type, ".ranger"), id = "ranger", num.trees = 2000) + branches = c(branches, branch_ranger) + } + + # svm + if ("svm" %in% learner_ids) { + svm_type = if (task_type == "classif") { + "C-classification" + } else { + "eps-regression" + } + branch_svm = po("removeconstants", id = "svm_removeconstants") %>>% + po("imputehist", id = "svm_imputehist") %>>% + po("imputeoor", id = "svm_imputeoor") %>>% + po("fixfactors", id = "svm_fixfactors") %>>% + po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "svm_imputesample") %>>% + po("collapsefactors", target_level_count = 100, id = "svm_collapse") %>>% + po("encode", method = "one-hot", id = "svm_encode") %>>% + po("removeconstants", id = "svm_post_removeconstants") %>>% + lrn(paste0(task_type, ".svm"), id = "svm", type = svm_type) + branches = c(branches, branch_svm) + } + + # xgboost + if ("xgboost" %in% learner_ids) { + branch_xgboost = po("removeconstants", id = "xgboost_removeconstants") %>>% + po("imputeoor", id = "xgboost_imputeoor") %>>% + po("fixfactors", id = "xgboost_fixfactors") %>>% + po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "xgboost_imputesample") %>>% + po("encodeimpact", id = "xgboost_encode") %>>% + po("removeconstants", id = "xgboost_post_removeconstants") %>>% + lrn(paste0(task_type, ".xgboost"), id = "xgboost", nrounds = 5000, early_stopping_rounds = 10) + branches = c(branches, branch_xgboost) + } + + # catboost + if ("catboost" %in% learner_ids) { + branch_catboost = po("colapply", id = "catboost_colapply", applicator = as.numeric, affect_columns = selector_type("integer")) %>>% + lrn(paste0(task_type, ".catboost"), id = "catboost", iterations = 500, early_stopping_rounds = 10, use_best_model = TRUE) + branches = c(branches, branch_catboost) + } + + # extra trees + if ("extra_trees" %in% learner_ids) { + branch_extra_trees 
= po("removeconstants", id = "extra_trees_removeconstants") %>>% + po("imputeoor", id = "extra_trees_imputeoor") %>>% + po("fixfactors", id = "extra_trees_fixfactors") %>>% + po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "extra_trees_imputesample") %>>% + po("collapsefactors", target_level_count = 40, id = "extra_trees_collapse") %>>% + po("removeconstants", id = "extra_trees_post_removeconstants") %>>% + lrn(paste0(task_type, ".ranger"), id = "extra_trees", splitrule = "extratrees", num.trees = 100, replace = FALSE, sample.fraction = 1) + branches = c(branches, branch_extra_trees) + } + + # lightgbm + if ("lightgbm" %in% learner_ids) { + branch_lightgbm = lrn(paste0(task_type, ".lightgbm"), id = "lightgbm", num_iterations = 5000, early_stopping_rounds = 10) + branches = c(branches, branch_lightgbm) + } + + # branch graph + po("branch", options = learner_ids) %>>% + gunion(branches) %>>% + po("unbranch", options = learner_ids) +} + +tuning_space = list( + glmnet = list( + glmnet.s = to_tune(1e-4, 1e4, logscale = TRUE), + glmnet.alpha = to_tune(0, 1) + ), + + kknn = list( + kknn.k = to_tune(1, 50, logscale = TRUE), + kknn.distance = to_tune(1, 5), + kknn.kernel = to_tune(c("rectangular", "optimal", "epanechnikov", "biweight", "triweight", "cos", "inv", "gaussian", "rank")) + ), + + lda = list(), + + extra_trees = list(), + + nnet = list( + nnet.maxit = to_tune(1e1, 1e3, logscale = TRUE), + nnet.decay = to_tune(1e-4, 1e-1, logscale = TRUE), + nnet.size = to_tune(2, 50, logscale = TRUE) + ), + + ranger = list( + ranger.mtry.ratio = to_tune(0, 1), + ranger.replace = to_tune(), + ranger.sample.fraction = to_tune(1e-1, 1), + ranger.num.trees = to_tune(500, 2000) + ), + + svm = list( + svm.cost = to_tune(1e-4, 1e4, logscale = TRUE), + svm.kernel = to_tune(c("polynomial", "radial", "sigmoid", "linear")), + svm.degree = to_tune(2, 5), + svm.gamma = to_tune(1e-4, 1e4, logscale = TRUE) + ), + + xgboost = list( + xgboost.eta = to_tune(1e-4, 1, logscale = TRUE), + xgboost.max_depth = to_tune(1, 20), + xgboost.colsample_bytree = to_tune(1e-1, 1), + xgboost.colsample_bylevel = to_tune(1e-1, 1), + xgboost.lambda = to_tune(1e-3, 1e3, logscale = TRUE), + xgboost.alpha = to_tune(1e-3, 1e3, logscale = TRUE), + xgboost.subsample = to_tune(1e-1, 1), + xgboost.nrounds = to_tune(1, 5000, internal = TRUE) + ), + + catboost = list( + catboost.depth = to_tune(5, 8), + catboost.learning_rate = to_tune(5e-3, 0.2, logscale = TRUE), + catboost.l2_leaf_reg = to_tune(1, 5), + catboost.iterations = to_tune(1, 500, internal = TRUE) + ), + + + lightgbm = list( + lightgbm.learning_rate = to_tune(5e-3, 0.2, logscale = TRUE), + lightgbm.feature_fraction = to_tune(0.75, 1), + lightgbm.min_data_in_leaf = to_tune(2, 60), + lightgbm.num_leaves = to_tune(16, 96), + lightgbm.num_iterations = to_tune(1, 5000, internal = TRUE) + ) +) \ No newline at end of file From 29271640423914d4330c4391033aa538b08808ff Mon Sep 17 00:00:00 2001 From: b-zhou Date: Fri, 26 Jul 2024 12:15:44 +0200 Subject: [PATCH 06/34] refactor: train --- DESCRIPTION | 2 + R/LearnerClassifAuto.R | 218 +---------------------------------------- R/LearnerRegrAuto.R | 154 +---------------------------- R/helper.R | 139 -------------------------- R/train_auto.R | 155 +++++++++++++++++++++++++++++ 5 files changed, 159 insertions(+), 509 deletions(-) create mode 100644 R/train_auto.R diff --git a/DESCRIPTION b/DESCRIPTION index 4ab70c0..dfc53bb 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -62,4 +62,6 @@ Collate: 'LearnerClassifAutoXgboost.R' 
     'LearnerRegrAuto.R'
     'helper.R'
+    'build_graph.R'
+    'train_auto.R'
     'zzz.R'

diff --git a/R/LearnerClassifAuto.R b/R/LearnerClassifAuto.R
index 132ab31..1d740b0 100644
--- a/R/LearnerClassifAuto.R
+++ b/R/LearnerClassifAuto.R
@@ -87,159 +87,7 @@ LearnerClassifAuto = R6Class("LearnerClassifAuto",
 
   private = list(
     .train = function(task) {
-      pv = self$param_set$values
-      learner_ids = pv$learner_ids
-      self$graph = build_graph(learner_ids, task_type = "classif")
-      self$tuning_space = tuning_space[learner_ids]
-
-      lg$debug("Training '%s' on task '%s'", self$id, task$id)
-
-      # initialize mbo tuner
-      tuner = tnr("adbo")
-
-      # remove learner based on memory limit
-      lg$debug("Starting to select from %i learners: %s", length(learner_ids), paste0(learner_ids, collapse = ","))
-
-      if (!is.null(pv$max_memory)) {
-        memory_usage = map_dbl(learner_ids, function(learner_id) {
-          self$graph$pipeops[[learner_id]]$learner$estimate_memory_usage(task) / 1e6
-        })
-        learner_ids = learner_ids[memory_usage < pv$max_memory]
-        lg$debug("Checking learners for memory limit of %i MB. Keeping %i learner(s): %s", pv$max_memory, length(learner_ids), paste0(learner_ids, collapse = ","))
-      }
-
-      # set number of threads
-      if (!is.null(pv$max_nthread)) {
-        lg$debug("Setting number of threads per learner to %i", pv$max_nthread)
-        walk(learner_ids, function(learner_id) {
-          set_threads(self$graph$pipeops[[learner_id]]$learner, pv$max_nthread)
-        })
-      }
-
-      # reduce number of workers on large data sets
-      if (!is.null(pv$large_data_size) && task$nrow * task$ncol > pv$large_data_size) {
-        lg$debug("Task size larger than %i rows", pv$large_data_size)
-
-        learner_ids = intersect(learner_ids, pv$large_data_learner_ids)
-        self$tuning_space = tuning_space[learner_ids]
-        lg$debug("Keeping %i learner(s): %s", length(learner_ids), paste0(learner_ids, collapse = ","))
-
-        lg$debug("Increasing number of threads per learner to %i", pv$large_data_nthread)
-        walk(learner_ids, function(learner_id) {
-          set_threads(self$graph$pipeops[[learner_id]]$learner, pv$large_data_nthread)
-        })
-        n_workers = rush_config()$n_workers
-        n = max(1, floor(n_workers / pv$large_data_nthread))
-        tuner$param_set$set_values(n_workers = n)
-        lg$debug("Reducing number of workers to %i", n)
-      }
-
-      # small data resampling
-      resampling = if (!is.null(pv$small_data_size) && task$nrow < pv$small_data_size) {
-        lg$debug("Task has less than %i rows", pv$small_data_size)
-        lg$debug("Using small data set resampling with %i iterations", pv$small_data_resampling$iters)
-        pv$small_data_resampling
-      } else {
-        pv$resampling
-      }
-
-      # cardinality
-      cardinality = map_int(task$col_info$levels, length)
-      if (!is.null(pv$max_cardinality) && any(cardinality > pv$max_cardinality)) {
-        lg$debug("Reducing number of factor levels to %i", pv$max_cardinality)
-
-        # collapse factors
-        pipeop_ids = names(self$graph$pipeops)
-        pipeop_ids = pipeop_ids[grep("collapse", pipeop_ids)]
-        walk(pipeop_ids, function(pipeop_id) {
-          self$graph$pipeops[[pipeop_id]]$param_set$values$target_level_count = pv$max_cardinality
-        })
-      }
-
-      if ("extra_trees" %in% learner_ids && any(cardinality > pv$extra_trees_max_cardinality)) {
-        lg$debug("Reducing number of factor levels to %i for extra trees", pv$extra_trees_max_cardinality)
-        self$graph$pipeops$extra_trees_collapse$param_set$values$target_level_count = pv$extra_trees_max_cardinality
-      }
-
-      # initialize graph learner
-      graph_learner = as_learner(self$graph)
-      graph_learner$id = "graph_learner"
-      graph_learner$predict_type = pv$measure$predict_type
graph_learner$fallback = lrn("classif.featureless", predict_type = pv$measure$predict_type) - graph_learner$encapsulate = c(train = "callr", predict = "callr") - graph_learner$timeout = c(train = pv$learner_timeout, predict = pv$learner_timeout) - - learners_with_validation = intersect(learner_ids, c("xgboost", "catboost", "lightgbm")) - if (length(learners_with_validation)) { - set_validate(graph_learner, "test", ids = learners_with_validation) - } - - # set early stopping - if ("xgboost" %in% learner_ids) { - graph_learner$param_set$values$xgboost.callbacks = list(cb_timeout_xgboost(pv$learner_timeout * 0.8)) - graph_learner$param_set$values$xgboost.eval_metric = pv$xgboost_eval_metric - } - if ("catboost" %in% learner_ids) { - graph_learner$param_set$values$catboost.eval_metric = pv$catboost_eval_metric - } - if ("lightgbm" %in% learner_ids) { - graph_learner$param_set$values$lightgbm.callbacks = list(cb_timeout_lightgbm(pv$learner_timeout * 0.8)) - graph_learner$param_set$values$lightgbm.eval = pv$lightgbm_eval_metric - } - - # initialize search space - tuning_space = unlist(unname(self$tuning_space), recursive = FALSE) - graph_scratch = graph_learner$clone(deep = TRUE) - graph_scratch$param_set$set_values(.values = tuning_space) - graph_scratch$param_set$set_values(branch.selection = to_tune(learner_ids)) - search_space = graph_scratch$param_set$search_space() - walk(learner_ids, function(learner_id) { - param_ids = search_space$ids() - param_ids = grep(paste0("^", learner_id), param_ids, value = TRUE) - walk(param_ids, function(param_id) { - # skip internal tuning parameter - if (param_id %in% c("xgboost.nrounds", "catboost.iterations", "lightgbm.num_iterations")) return() - search_space$add_dep( - id = param_id, - on = "branch.selection", - cond = CondEqual$new(learner_id) - ) - }) - }) - - # initial design - lhs_xdt = generate_lhs_design(pv$lhs_size, self$task_type, setdiff(learner_ids, c("lda", "extra_trees")), self$tuning_space) - default_xdt = generate_default_design(self$task_type, learner_ids, task, self$tuning_space) - initial_xdt = rbindlist(list(lhs_xdt, default_xdt), use.names = TRUE, fill = TRUE) - setorderv(initial_xdt, "branch.selection") - tuner$param_set$set_values(initial_design = initial_xdt) - - # initialize auto tuner - self$instance = ti_async( - task = task, - learner = graph_learner, - resampling = resampling, - measures = pv$measure, - terminator = pv$terminator, - search_space = search_space, - callbacks = pv$callbacks, - store_benchmark_result = pv$store_benchmark_result - ) - - # tune - lg$debug("Learner '%s' starts tuning phase", self$id) - tuner$optimize(self$instance) - - # fit final model - lg$debug("Learner '%s' fits final model", self$id) - if (length(learners_with_validation)) { - set_validate(graph_learner, NULL, ids = intersect(learner_ids, c("xgboost", "catboost", "lightgbm"))) - } - graph_learner$param_set$set_values(.values = self$instance$result_learner_param_vals, .insert = FALSE) - graph_learner$timeout = c(train = Inf, predict = Inf) - graph_learner$train(task) - - list(graph_learner = graph_learner, instance = self$instance) + train_auto(self, task, task_type = "classif") }, .predict = function(task) { @@ -251,67 +99,3 @@ LearnerClassifAuto = R6Class("LearnerClassifAuto", #' @include aaa.R learners[["classif.auto"]] = LearnerClassifAuto - -tuning_space = list( - glmnet = list( - glmnet.s = to_tune(1e-4, 1e4, logscale = TRUE), - glmnet.alpha = to_tune(0, 1) - ), - - kknn = list( - kknn.k = to_tune(1, 50, logscale = TRUE), - kknn.distance = 
to_tune(1, 5), - kknn.kernel = to_tune(c("rectangular", "optimal", "epanechnikov", "biweight", "triweight", "cos", "inv", "gaussian", "rank")) - ), - - lda = list(), - - extra_trees = list(), - - nnet = list( - nnet.maxit = to_tune(1e1, 1e3, logscale = TRUE), - nnet.decay = to_tune(1e-4, 1e-1, logscale = TRUE), - nnet.size = to_tune(2, 50, logscale = TRUE) - ), - - ranger = list( - ranger.mtry.ratio = to_tune(0, 1), - ranger.replace = to_tune(), - ranger.sample.fraction = to_tune(1e-1, 1), - ranger.num.trees = to_tune(500, 2000) - ), - - svm = list( - svm.cost = to_tune(1e-4, 1e4, logscale = TRUE), - svm.kernel = to_tune(c("polynomial", "radial", "sigmoid", "linear")), - svm.degree = to_tune(2, 5), - svm.gamma = to_tune(1e-4, 1e4, logscale = TRUE) - ), - - xgboost = list( - xgboost.eta = to_tune(1e-4, 1, logscale = TRUE), - xgboost.max_depth = to_tune(1, 20), - xgboost.colsample_bytree = to_tune(1e-1, 1), - xgboost.colsample_bylevel = to_tune(1e-1, 1), - xgboost.lambda = to_tune(1e-3, 1e3, logscale = TRUE), - xgboost.alpha = to_tune(1e-3, 1e3, logscale = TRUE), - xgboost.subsample = to_tune(1e-1, 1), - xgboost.nrounds = to_tune(1, 5000, internal = TRUE) - ), - - catboost = list( - catboost.depth = to_tune(5, 8), - catboost.learning_rate = to_tune(5e-3, 0.2, logscale = TRUE), - catboost.l2_leaf_reg = to_tune(1, 5), - catboost.iterations = to_tune(1, 500, internal = TRUE) - ), - - - lightgbm = list( - lightgbm.learning_rate = to_tune(5e-3, 0.2, logscale = TRUE), - lightgbm.feature_fraction = to_tune(0.75, 1), - lightgbm.min_data_in_leaf = to_tune(2, 60), - lightgbm.num_leaves = to_tune(16, 96), - lightgbm.num_iterations = to_tune(1, 5000, internal = TRUE) - ) -) diff --git a/R/LearnerRegrAuto.R b/R/LearnerRegrAuto.R index 20ccf50..19dc8b8 100644 --- a/R/LearnerRegrAuto.R +++ b/R/LearnerRegrAuto.R @@ -87,159 +87,7 @@ LearnerRegrAuto = R6Class("LearnerRegrAuto", ), private = list( .train = function(task) { - pv = self$param_set$values - learner_ids = pv$learner_ids - self$graph = build_graph(learner_ids, task_type = "regr") - self$tuning_space = tuning_space[learner_ids] - - lg$debug("Training '%s' on task '%s'", self$id, task$id) - - # initialize mbo tuner - tuner = tnr("adbo") - - # remove learner based on memory limit - lg$debug("Starting to select from %i learners: %s", length(learner_ids), paste0(learner_ids, collapse = ",")) - - if (!is.null(pv$max_memory)) { - memory_usage = map_dbl(learner_ids, function(learner_id) { - self$graph$pipeops[[learner_id]]$learner$estimate_memory_usage(task) / 1e6 - }) - learner_ids = learner_ids[memory_usage < pv$max_memory] - lg$debug("Checking learners for memory limit of %i MB. 
Keeping %i learner(s): %s", pv$max_memory, length(learner_ids), paste0(learner_ids, collapse = ",")) - } - - # set number of threads - if (!is.null(pv$max_nthread)) { - lg$debug("Setting number of threads per learner to %i", pv$max_nthread) - walk(learner_ids, function(learner_id) { - set_threads(self$graph$pipeops[[learner_id]]$learner, pv$max_nthread) - }) - } - - # reduce number of workers on large data sets - if (!is.null(pv$large_data_size) && task$nrow * task$ncol > pv$large_data_size) { - lg$debug("Task size larger than %i rows", pv$large_data_size) - - learner_ids = intersect(learner_ids, pv$large_data_learner_ids) - self$tuning_space = tuning_space[learner_ids] - lg$debug("Keeping %i learner(s): %s", length(learner_ids), paste0(learner_ids, collapse = ",")) - - lg$debug("Increasing number of threads per learner to %i", pv$large_data_nthread) - walk(learner_ids, function(learner_id) { - set_threads(self$graph$pipeops[[learner_id]]$learner, pv$large_data_nthread) - }) - n_workers = rush_config()$n_workers - n = max(1, floor(n_workers / pv$large_data_nthread)) - tuner$param_set$set_values(n_workers = n) - lg$debug("Reducing number of workers to %i", n) - } - - # small data resampling - resampling = if (!is.null(pv$small_data_size) && task$nrow < pv$small_data_size) { - lg$debug("Task has less than %i rows", pv$small_data_size) - lg$debug("Using small data set resampling with %i iterations", pv$small_data_resampling$iters) - pv$small_data_resampling - } else { - pv$resampling - } - - # cardinality - cardinality = map_int(task$col_info$levels, length) - if (!is.null(pv$max_cardinality) && any(cardinality > pv$max_cardinality)) { - lg$debug("Reducing number of factor levels to %i", pv$max_cardinality) - - # collapse factors - pipeop_ids = names(self$graph$pipeops) - pipeop_ids = pipeop_ids[grep("collapse", pipeop_ids)] - walk(pipeop_ids, function(pipeop_id) { - self$graph$pipeops[[pipeop_id]]$param_set$values$target_level_count = pv$max_cardinality - }) - } - - if ("extra_trees" %in% learner_ids && any(cardinality > pv$extra_trees_max_cardinality)) { - lg$debug("Reducing number of factor levels to %i for extra trees", pv$extra_trees_max_cardinality) - self$graph$pipeops$extra_trees_collapse$param_set$values$target_level_count = pv$extra_trees_max_cardinality - } - - # initialize graph learner - graph_learner = as_learner(self$graph) - graph_learner$id = "graph_learner" - graph_learner$predict_type = pv$measure$predict_type - graph_learner$fallback = lrn("regr.featureless", predict_type = pv$measure$predict_type) - graph_learner$encapsulate = c(train = "callr", predict = "callr") - graph_learner$timeout = c(train = pv$learner_timeout, predict = pv$learner_timeout) - - learners_with_validation = intersect(learner_ids, c("xgboost", "catboost", "lightgbm")) - if (length(learners_with_validation)) { - set_validate(graph_learner, "test", ids = learners_with_validation) - } - - # set early stopping - if ("xgboost" %in% learner_ids) { - graph_learner$param_set$values$xgboost.callbacks = list(cb_timeout_xgboost(pv$learner_timeout * 0.8)) - graph_learner$param_set$values$xgboost.eval_metric = pv$xgboost_eval_metric - } - if ("catboost" %in% learner_ids) { - graph_learner$param_set$values$catboost.eval_metric = pv$catboost_eval_metric - } - if ("lightgbm" %in% learner_ids) { - graph_learner$param_set$values$lightgbm.callbacks = list(cb_timeout_lightgbm(pv$learner_timeout * 0.8)) - graph_learner$param_set$values$lightgbm.eval = pv$lightgbm_eval_metric - } - - # initialize search space - 
tuning_space = unlist(unname(self$tuning_space), recursive = FALSE) - graph_scratch = graph_learner$clone(deep = TRUE) - graph_scratch$param_set$set_values(.values = tuning_space) - graph_scratch$param_set$set_values(branch.selection = to_tune(learner_ids)) - search_space = graph_scratch$param_set$search_space() - walk(learner_ids, function(learner_id) { - param_ids = search_space$ids() - param_ids = grep(paste0("^", learner_id), param_ids, value = TRUE) - walk(param_ids, function(param_id) { - # skip internal tuning parameter - if (param_id %in% c("xgboost.nrounds", "catboost.iterations", "lightgbm.num_iterations")) return() - search_space$add_dep( - id = param_id, - on = "branch.selection", - cond = CondEqual$new(learner_id) - ) - }) - }) - - # initial design - lhs_xdt = generate_lhs_design(pv$lhs_size, self$task_type, setdiff(learner_ids, "extra_trees"), self$tuning_space) - default_xdt = generate_default_design(self$task_type, learner_ids, task, self$tuning_space) - initial_xdt = rbindlist(list(lhs_xdt, default_xdt), use.names = TRUE, fill = TRUE) - setorderv(initial_xdt, "branch.selection") - tuner$param_set$set_values(initial_design = initial_xdt) - - # initialize auto tuner - self$instance = ti_async( - task = task, - learner = graph_learner, - resampling = resampling, - measures = pv$measure, - terminator = pv$terminator, - search_space = search_space, - callbacks = pv$callbacks, - store_benchmark_result = pv$store_benchmark_result - ) - - # tune - lg$debug("Learner '%s' starts tuning phase", self$id) - tuner$optimize(self$instance) - - # fit final model - lg$debug("Learner '%s' fits final model", self$id) - if (length(learners_with_validation)) { - set_validate(graph_learner, NULL, ids = intersect(learner_ids, c("xgboost", "catboost", "lightgbm"))) - } - graph_learner$param_set$set_values(.values = self$instance$result_learner_param_vals, .insert = FALSE) - graph_learner$timeout = c(train = Inf, predict = Inf) - graph_learner$train(task) - - list(graph_learner = graph_learner, instance = self$instance) + train_auto(self, task, task_type = "regr") }, .predict = function(task) { diff --git a/R/helper.R b/R/helper.R index 8d4ff40..4b27c1c 100644 --- a/R/helper.R +++ b/R/helper.R @@ -1,142 +1,3 @@ -build_graph = function(learner_ids, task_type) { - assert_choice(task_type, c("classif", "regr")) - learners_reg = c("glmnet", "kknn", "nnet", "ranger", "svm", "xgboost", "catboost", "extra_trees", "lightgbm") - if (task_type == "regr") { - assert_subset(learner_ids, learners_reg) - } else { - assert_subset(learner_ids, c(learners_reg, "lda")) - } - - branches = list() - # glmnet - if ("glmnet" %in% learner_ids) { - branch_glmnet = po("removeconstants", id = "glmnet_removeconstants") %>>% - po("imputehist", id = "glmnet_imputehist") %>>% - po("imputeoor", id = "glmnet_imputeoor") %>>% - po("fixfactors", id = "glmnet_fixfactors") %>>% - po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "glmnet_imputesample") %>>% - po("collapsefactors", target_level_count = 100, id = "glmnet_collapse") %>>% - po("encode", method = "one-hot", id = "glmnet_encode") %>>% - po("removeconstants", id = "glmnet_post_removeconstants") %>>% - lrn(paste0(task_type, ".glmnet"), id = "glmnet") - branches = c(branches, branch_glmnet) - } - - # kknn - if ("kknn" %in% learner_ids) { - branch_kknn = po("removeconstants", id = "kknn_removeconstants") %>>% - po("imputehist", id = "kknn_imputehist") %>>% - po("imputeoor", id = "kknn_imputeoor") %>>% - po("fixfactors", id = "kknn_fixfactors") %>>% 
- po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "kknn_imputesample") %>>% - po("collapsefactors", target_level_count = 100, id = "kknn_collapse") %>>% - po("removeconstants", id = "kknn_post_removeconstants") %>>% - lrn(paste0(task_type, ".kknn"), id = "kknn") - branches = c(branches, branch_kknn) - } - - # lda - # only for classification - if ("lda" %in% learner_ids) { - branch_lda = po("removeconstants", id = "lda_removeconstants") %>>% - po("imputehist", id = "lda_imputehist") %>>% - po("imputeoor", id = "lda_imputeoor") %>>% - po("fixfactors", id = "lda_fixfactors") %>>% - po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "lda_imputesample") %>>% - po("collapsefactors", target_level_count = 100, id = "lda_collapse") %>>% - po("removeconstants", id = "lda_post_removeconstants") %>>% - lrn("classif.lda", id = "lda") - branches = c(branches, branch_lda) - } - - # nnet - if ("nnet" %in% learner_ids) { - branch_nnet = po("removeconstants", id = "nnet_removeconstants") %>>% - po("imputehist", id = "nnet_imputehist") %>>% - po("imputeoor", id = "nnet_imputeoor") %>>% - po("fixfactors", id = "nnet_fixfactors") %>>% - po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "nnet_imputesample") %>>% - po("collapsefactors", target_level_count = 100, id = "nnet_collapse") %>>% - po("removeconstants", id = "nnet_post_removeconstants") %>>% - lrn(paste0(task_type, ".nnet"), id = "nnet") - branches = c(branches, branch_nnet) - } - - # ranger - if ("ranger" %in% learner_ids) { - branch_ranger = po("removeconstants", id = "ranger_removeconstants") %>>% - po("imputeoor", id = "ranger_imputeoor") %>>% - po("fixfactors", id = "ranger_fixfactors") %>>% - po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "ranger_imputesample") %>>% - po("collapsefactors", target_level_count = 100, id = "ranger_collapse") %>>% - po("removeconstants", id = "ranger_post_removeconstants") %>>% - # use upper bound of search space for memory estimation - lrn(paste0(task_type, ".ranger"), id = "ranger", num.trees = 2000) - branches = c(branches, branch_ranger) - } - - # svm - if ("svm" %in% learner_ids) { - svm_type = if (task_type == "classif") { - "C-classification" - } else { - "eps-regression" - } - branch_svm = po("removeconstants", id = "svm_removeconstants") %>>% - po("imputehist", id = "svm_imputehist") %>>% - po("imputeoor", id = "svm_imputeoor") %>>% - po("fixfactors", id = "svm_fixfactors") %>>% - po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "svm_imputesample") %>>% - po("collapsefactors", target_level_count = 100, id = "svm_collapse") %>>% - po("encode", method = "one-hot", id = "svm_encode") %>>% - po("removeconstants", id = "svm_post_removeconstants") %>>% - lrn(paste0(task_type, ".svm"), id = "svm", type = svm_type) - branches = c(branches, branch_svm) - } - - # xgboost - if ("xgboost" %in% learner_ids) { - branch_xgboost = po("removeconstants", id = "xgboost_removeconstants") %>>% - po("imputeoor", id = "xgboost_imputeoor") %>>% - po("fixfactors", id = "xgboost_fixfactors") %>>% - po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "xgboost_imputesample") %>>% - po("encodeimpact", id = "xgboost_encode") %>>% - po("removeconstants", id = "xgboost_post_removeconstants") %>>% - lrn(paste0(task_type, ".xgboost"), id = "xgboost", nrounds = 5000, early_stopping_rounds = 10) - branches = c(branches, branch_xgboost) - } - - # catboost - if 
("catboost" %in% learner_ids) { - branch_catboost = po("colapply", id = "catboost_colapply", applicator = as.numeric, affect_columns = selector_type("integer")) %>>% - lrn(paste0(task_type, ".catboost"), id = "catboost", iterations = 500, early_stopping_rounds = 10, use_best_model = TRUE) - branches = c(branches, branch_catboost) - } - - # extra trees - if ("extra_trees" %in% learner_ids) { - branch_extra_trees = po("removeconstants", id = "extra_trees_removeconstants") %>>% - po("imputeoor", id = "extra_trees_imputeoor") %>>% - po("fixfactors", id = "extra_trees_fixfactors") %>>% - po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "extra_trees_imputesample") %>>% - po("collapsefactors", target_level_count = 40, id = "extra_trees_collapse") %>>% - po("removeconstants", id = "extra_trees_post_removeconstants") %>>% - lrn(paste0(task_type, ".ranger"), id = "extra_trees", splitrule = "extratrees", num.trees = 100, replace = FALSE, sample.fraction = 1) - branches = c(branches, branch_extra_trees) - } - - # lightgbm - if ("lightgbm" %in% learner_ids) { - branch_lightgbm = lrn(paste0(task_type, ".lightgbm"), id = "lightgbm", num_iterations = 5000, early_stopping_rounds = 10) - branches = c(branches, branch_lightgbm) - } - - # branch graph - po("branch", options = learner_ids) %>>% - gunion(branches) %>>% - po("unbranch", options = learner_ids) -} - generate_default_design = function(task_type, learner_ids, task, tuning_space, branch = TRUE) { map_dtr(learner_ids, function(learner_id) { if (paste0(task_type, ".", learner_id) %nin% mlr_learners$keys()) { diff --git a/R/train_auto.R b/R/train_auto.R new file mode 100644 index 0000000..2ff174f --- /dev/null +++ b/R/train_auto.R @@ -0,0 +1,155 @@ +train_auto = function(self, task, task_type) { + pv = self$param_set$values + learner_ids = pv$learner_ids + self$graph = build_graph(learner_ids, task_type) + self$tuning_space = tuning_space[learner_ids] + + lg$debug("Training '%s' on task '%s'", self$id, task$id) + + # initialize mbo tuner + tuner = tnr("adbo") + + # remove learner based on memory limit + lg$debug("Starting to select from %i learners: %s", length(learner_ids), paste0(learner_ids, collapse = ",")) + + if (!is.null(pv$max_memory)) { + memory_usage = map_dbl(learner_ids, function(learner_id) { + self$graph$pipeops[[learner_id]]$learner$estimate_memory_usage(task) / 1e6 + }) + learner_ids = learner_ids[memory_usage < pv$max_memory] + lg$debug("Checking learners for memory limit of %i MB. 
Keeping %i learner(s): %s", pv$max_memory, length(learner_ids), paste0(learner_ids, collapse = ",")) + } + + # set number of threads + if (!is.null(pv$max_nthread)) { + lg$debug("Setting number of threads per learner to %i", pv$max_nthread) + walk(learner_ids, function(learner_id) { + set_threads(self$graph$pipeops[[learner_id]]$learner, pv$max_nthread) + }) + } + + # reduce number of workers on large data sets + if (!is.null(pv$large_data_size) && task$nrow * task$ncol > pv$large_data_size) { + lg$debug("Task size larger than %i rows", pv$large_data_size) + + learner_ids = intersect(learner_ids, pv$large_data_learner_ids) + self$tuning_space = tuning_space[learner_ids] + lg$debug("Keeping %i learner(s): %s", length(learner_ids), paste0(learner_ids, collapse = ",")) + + lg$debug("Increasing number of threads per learner to %i", pv$large_data_nthread) + walk(learner_ids, function(learner_id) { + set_threads(self$graph$pipeops[[learner_id]]$learner, pv$large_data_nthread) + }) + n_workers = rush_config()$n_workers + n = max(1, floor(n_workers / pv$large_data_nthread)) + tuner$param_set$set_values(n_workers = n) + lg$debug("Reducing number of workers to %i", n) + } + + # small data resampling + resampling = if (!is.null(pv$small_data_size) && task$nrow < pv$small_data_size) { + lg$debug("Task has less than %i rows", pv$small_data_size) + lg$debug("Using small data set resampling with %i iterations", pv$small_data_resampling$iters) + pv$small_data_resampling + } else { + pv$resampling + } + + # cardinality + cardinality = map_int(task$col_info$levels, length) + if (!is.null(pv$max_cardinality) && any(cardinality > pv$max_cardinality)) { + lg$debug("Reducing number of factor levels to %i", pv$max_cardinality) + + # collapse factors + pipeop_ids = names(self$graph$pipeops) + pipeop_ids = pipeop_ids[grep("collapse", pipeop_ids)] + walk(pipeop_ids, function(pipeop_id) { + self$graph$pipeops[[pipeop_id]]$param_set$values$target_level_count = pv$max_cardinality + }) + } + + if ("extra_trees" %in% learner_ids && any(cardinality > pv$extra_trees_max_cardinality)) { + lg$debug("Reducing number of factor levels to %i for extra trees", pv$extra_trees_max_cardinality) + self$graph$pipeops$extra_trees_collapse$param_set$values$target_level_count = pv$extra_trees_max_cardinality + } + + # initialize graph learner + graph_learner = as_learner(self$graph) + graph_learner$id = "graph_learner" + graph_learner$predict_type = pv$measure$predict_type + graph_learner$fallback = lrn(paste0(task_type, ".featureless"), predict_type = pv$measure$predict_type) + graph_learner$encapsulate = c(train = "callr", predict = "callr") + graph_learner$timeout = c(train = pv$learner_timeout, predict = pv$learner_timeout) + + learners_with_validation = intersect(learner_ids, c("xgboost", "catboost", "lightgbm")) + if (length(learners_with_validation)) { + set_validate(graph_learner, "test", ids = learners_with_validation) + } + + # set early stopping + if ("xgboost" %in% learner_ids) { + graph_learner$param_set$values$xgboost.callbacks = list(cb_timeout_xgboost(pv$learner_timeout * 0.8)) + graph_learner$param_set$values$xgboost.eval_metric = pv$xgboost_eval_metric + } + if ("catboost" %in% learner_ids) { + graph_learner$param_set$values$catboost.eval_metric = pv$catboost_eval_metric + } + if ("lightgbm" %in% learner_ids) { + graph_learner$param_set$values$lightgbm.callbacks = list(cb_timeout_lightgbm(pv$learner_timeout * 0.8)) + graph_learner$param_set$values$lightgbm.eval = pv$lightgbm_eval_metric + } + + # initialize search 
space + tuning_space = unlist(unname(self$tuning_space), recursive = FALSE) + graph_scratch = graph_learner$clone(deep = TRUE) + graph_scratch$param_set$set_values(.values = tuning_space) + graph_scratch$param_set$set_values(branch.selection = to_tune(learner_ids)) + search_space = graph_scratch$param_set$search_space() + walk(learner_ids, function(learner_id) { + param_ids = search_space$ids() + param_ids = grep(paste0("^", learner_id), param_ids, value = TRUE) + walk(param_ids, function(param_id) { + # skip internal tuning parameter + if (param_id %in% c("xgboost.nrounds", "catboost.iterations", "lightgbm.num_iterations")) return() + search_space$add_dep( + id = param_id, + on = "branch.selection", + cond = CondEqual$new(learner_id) + ) + }) + }) + + # initial design + lhs_xdt = generate_lhs_design(pv$lhs_size, self$task_type, setdiff(learner_ids, c("lda", "extra_trees")), self$tuning_space) + default_xdt = generate_default_design(self$task_type, learner_ids, task, self$tuning_space) + initial_xdt = rbindlist(list(lhs_xdt, default_xdt), use.names = TRUE, fill = TRUE) + setorderv(initial_xdt, "branch.selection") + tuner$param_set$set_values(initial_design = initial_xdt) + + # initialize auto tuner + self$instance = ti_async( + task = task, + learner = graph_learner, + resampling = resampling, + measures = pv$measure, + terminator = pv$terminator, + search_space = search_space, + callbacks = pv$callbacks, + store_benchmark_result = pv$store_benchmark_result + ) + + # tune + lg$debug("Learner '%s' starts tuning phase", self$id) + tuner$optimize(self$instance) + + # fit final model + lg$debug("Learner '%s' fits final model", self$id) + if (length(learners_with_validation)) { + set_validate(graph_learner, NULL, ids = intersect(learner_ids, c("xgboost", "catboost", "lightgbm"))) + } + graph_learner$param_set$set_values(.values = self$instance$result_learner_param_vals, .insert = FALSE) + graph_learner$timeout = c(train = Inf, predict = Inf) + graph_learner$train(task) + + list(graph_learner = graph_learner, instance = self$instance) +} \ No newline at end of file From 90c47570afba3c13c1870f997f21ad808068b756 Mon Sep 17 00:00:00 2001 From: b-zhou Date: Mon, 29 Jul 2024 15:57:41 +0200 Subject: [PATCH 07/34] fix: regr learner id --- R/LearnerRegrAuto.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/LearnerRegrAuto.R b/R/LearnerRegrAuto.R index 19dc8b8..6d93d86 100644 --- a/R/LearnerRegrAuto.R +++ b/R/LearnerRegrAuto.R @@ -22,7 +22,7 @@ LearnerRegrAuto = R6Class("LearnerRegrAuto", #' @description #' Creates a new instance of this [R6][R6::R6Class] class. 
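An aside on the one-line fix below: the constructor id had been copied from the classification learner, so the regression learner registered under the wrong key. After the change it is constructed via `lrn("regr.auto")`. A minimal usage sketch with illustrative settings borrowed from the package tests (assumes the suggested learner packages are installed and a rush worker plan is configured):

# Illustrative sketch, not part of the patch:
library(mlr3automl)

rush_plan(n_workers = 2)

task = tsk("boston_housing")
learner = lrn("regr.auto",
  learner_ids = "ranger",
  resampling = rsmp("holdout"),
  measure = msr("regr.mse"),
  terminator = trm("evals", n_evals = 6)
)
learner$train(task)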
- initialize = function(id = "classif.auto") { + initialize = function(id = "regr.auto") { param_set = ps( # learner learner_ids = p_uty(default = c("glmnet", "kknn", "nnet", "ranger", "svm", "xgboost", "catboost", "extra_trees", "lightgbm"), From 953940be71d0ad0297761f6fd95e7b75f5c3f373 Mon Sep 17 00:00:00 2001 From: b-zhou Date: Mon, 29 Jul 2024 16:27:14 +0200 Subject: [PATCH 08/34] test: extra_trees and glmnet --- tests/testthat/test_LearnerClassifAuto.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test_LearnerClassifAuto.R b/tests/testthat/test_LearnerClassifAuto.R index 7eebf33..4ea0f26 100644 --- a/tests/testthat/test_LearnerClassifAuto.R +++ b/tests/testthat/test_LearnerClassifAuto.R @@ -211,7 +211,7 @@ test_that("extra_trees and glmnet works", { ) expect_class(learner$train(task), "LearnerClassifAuto") - expect_equal(learner$model$instance$result$branch.selection, "extra_trees") + expect_choice(learner$model$instance$result$branch.selection, c("extra_trees", "glmnet")) }) test_that("lightgbm works", { From 6d241534b8a0416483555d8c5cae9c4cffe0a106 Mon Sep 17 00:00:00 2001 From: b-zhou Date: Sun, 4 Aug 2024 21:04:30 +0200 Subject: [PATCH 09/34] feat: configspace --- DESCRIPTION | 4 +- R/save_deepcave_run.R | 107 ++++++++++++++++++++++++ tests/testthat/test_save_deepcave_run.R | 24 ++++++ 3 files changed, 134 insertions(+), 1 deletion(-) create mode 100644 R/save_deepcave_run.R create mode 100644 tests/testthat/test_save_deepcave_run.R diff --git a/DESCRIPTION b/DESCRIPTION index dfc53bb..855771c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -27,7 +27,8 @@ Imports: mlr3tuningspaces, paradox (>= 1.0.1), R6, - utils + utils, + jsonlite Suggests: catboost, e1071, @@ -63,5 +64,6 @@ Collate: 'LearnerRegrAuto.R' 'helper.R' 'build_graph.R' + 'save_deepcave_run.R' 'train_auto.R' 'zzz.R' diff --git a/R/save_deepcave_run.R b/R/save_deepcave_run.R new file mode 100644 index 0000000..f5c7453 --- /dev/null +++ b/R/save_deepcave_run.R @@ -0,0 +1,107 @@ +save_deepcave_run = function(learner, path = "logs/mlr3automl") { + jsonlite::write_json( + get_configspace(learner), + paste0(path, "/configspace.json"), + auto_unbox = TRUE, pretty = TRUE, null = "null" + ) + + jsonlite::write_json( + get_configs(learner), + paste0(path, "/configs.json"), + auto_unbox = TRUE, pretty = TRUE, null = "null" + ) + + jsonlite::write_json( + get_history(learner), + paste0(path, "/history.json"), + auto_unbox = TRUE, pretty = TRUE, null = "null" + ) + + jsonlite::write_json( + get_meta(learner), + paste0(path, "/meta.json"), + auto_unbox = TRUE, pretty = TRUE, null = "null" + ) + + # origins.json + origins = rep(list(NULL), learner$instance$archive$n_evals) + names(origins) = seq(learner$instance$archive$n_evals) - 1 + jsonlite::write_json( + origins, + paste0(path, "/origins.json"), + pretty = TRUE, null = "null" + ) +} + +get_configspace = function(learner) { + n_params = nrow(learner$instance$search_space$data) + + hyperparameters_list = lapply(seq_len(n_params), function(i) { + row = search_space$data[i, ] + name = row[["id"]] + type = switch(row[["class"]], + ParamFct = "categorical", + ParamLgl = "categorical", + ParamDbl = "uniform_float", + ParamInt = "uniform_int") + + # categorical params + if (type == "categorical") { + choices = unlist(row[["levels"]]) + # TBD: default + return(list( + name = name, + type = type, + choices = choices, + weights = NULL)) + } + + # int / float params + is_logscale = search_space$is_logscale[[name]] + lower = row[["lower"]] + upper = 
row[["upper"]] + if (is_logscale) { + lower = exp(lower) + upper = exp(upper) + } + # TBD: default + return(list( + name = name, + type = type, + log = is_logscale, + lower = lower, + upper = upper)) + }) + + conditions_list = lapply(seq_len(n_params), function(i) { + row = search_space$deps[i, ] + child = row[["id"]] + parent = row[["on"]] + + # `cond` (below) is a list of `Condition`s. Currently, there are only 'CondEqual' and 'CondAnyOf', + # which should not be used simultaneously. So this list should always contain only one entry. + cond = row[["cond"]][[1]] + if (is(cond, "CondEqual")) { + return(list(child = child, parent = parent, type = "EQ", value = cond$rhs)) + } + return(list(child = child, parent = parent, type = "IN", values = cond$rhs)) + }) + + return(list( + hyperparameters = hyperparameters_list, + conditions = conditions_list, + forbiddens = list() + )) +} + +get_configs = function(learner){ + list(TBD = "TBD") +} + +get_history = function(learner){ + list(TBD = "TBD") +} + +get_meta = function(learner){ + list(TBD = "TBD") +} diff --git a/tests/testthat/test_save_deepcave_run.R b/tests/testthat/test_save_deepcave_run.R new file mode 100644 index 0000000..05befdf --- /dev/null +++ b/tests/testthat/test_save_deepcave_run.R @@ -0,0 +1,24 @@ +test_that("run is saved", { + rush_plan(n_workers = 2) + skip_if_not_installed("e1071") + + task = tsk("penguins") + learner = lrn("classif.auto", + learner_ids = "svm", + small_data_size = 1, + resampling = rsmp("holdout"), + measure = msr("classif.ce"), + terminator = trm("evals", n_evals = 6) + ) + learner$train(task) + + dir = tempdir() + + save_deepcave_run(learner, path = paste0(dir)) + + expect_file_exists(paste0(dir, "/configspace.json")) + expect_file_exists(paste0(dir, "/configs.json")) + expect_file_exists(paste0(dir, "/history.json")) + expect_file_exists(paste0(dir, "/meta.json")) + expect_file_exists(paste0(dir, "/origins.json")) +}) From 27289f06d957ebba93a6bce4f5e99ba1f3a9f38b Mon Sep 17 00:00:00 2001 From: b-zhou Date: Thu, 8 Aug 2024 19:34:20 +0200 Subject: [PATCH 10/34] docs: save deepcave --- R/save_deepcave_run.R | 94 +++++++++++++++++++++++++++++++++---------- 1 file changed, 72 insertions(+), 22 deletions(-) diff --git a/R/save_deepcave_run.R b/R/save_deepcave_run.R index f5c7453..8d291fa 100644 --- a/R/save_deepcave_run.R +++ b/R/save_deepcave_run.R @@ -1,40 +1,91 @@ -save_deepcave_run = function(learner, path = "logs/mlr3automl") { +#' @title Save Tuning History as a DeepCAVE Run +#' +#' @description +#' Exports information stored in a `TuningInstance` in a format recognized by [DeepCAVE](https://automl.github.io/DeepCAVE/main/index.html) as a run. Each run is stored as a folder containing five files `configs.json`, `configspace.json`, `history.jsonl`, `meta.json`, and `origins.json`. +#' +#' @param instance ([TuningInstanceAsyncSingleCrit]) +#' Tuning instance to save. +#' +#' @param path (`character(1)`) +#' Path to save the run. Defaults to `"logs/mlr3automl`. +#' +#' @param prefix (`character(1)`) +#' Prefix for the name of a new subfolder under `path` for storing the current run. +#' +#' @param overwrite (`character(1)`) +#' If `FALSE` (default), creates a new subfolder to save the current run. If `TRUE`, all existing runs will be deleted. 
+#' +#' @export +save_deepcave_run = function(instance, path = "logs/mlr3automl", prefix = "run", overwrite = FALSE) { + # don't save untuned instance + if (!length(instance$result_learner_param_vals)) { + warning("No run is saved, because no tuning has been completed.") + return() + } + + # create a subfolder for saving the current run + # original Python implementation see `Recorder._set_path()` + # (https://github.com/automl/DeepCAVE/blob/main/deepcave/runs/recorder.py) + if (!overwrite) { + new_idx = 0 + for (fn in list.files(path)) { + if (!startsWith(fn, "prefix")) next + idx = last(strsplit(fn, "_")[[1]]) + if (is.numeric(idx)) { + idx_int = as.integer(idx) + if (idx_int > new_idx) { + new_idx = idx_int + } + } + } + new_idx = new_idx + 1 + run_path = file.path(path, paste0(prefix, "_", new_idx)) + } + jsonlite::write_json( - get_configspace(learner), - paste0(path, "/configspace.json"), + get_configspace(instance), + paste0(run_path, "/configspace.json"), auto_unbox = TRUE, pretty = TRUE, null = "null" ) jsonlite::write_json( - get_configs(learner), - paste0(path, "/configs.json"), + get_configs(instance), + paste0(run_path, "/configs.json"), auto_unbox = TRUE, pretty = TRUE, null = "null" ) jsonlite::write_json( - get_history(learner), - paste0(path, "/history.json"), + get_history(instance), + paste0(run_path, "/history.json"), auto_unbox = TRUE, pretty = TRUE, null = "null" ) jsonlite::write_json( - get_meta(learner), - paste0(path, "/meta.json"), + get_meta(instance), + paste0(run_path, "/meta.json"), auto_unbox = TRUE, pretty = TRUE, null = "null" ) - # origins.json - origins = rep(list(NULL), learner$instance$archive$n_evals) - names(origins) = seq(learner$instance$archive$n_evals) - 1 + # create `origins.json` (a list of `null`s) + origins = rep(list(NULL), instance$instance$archive$n_evals) + names(origins) = seq(instance$instance$archive$n_evals) - 1 jsonlite::write_json( origins, - paste0(path, "/origins.json"), + paste0(run_path, "/origins.json"), pretty = TRUE, null = "null" ) } -get_configspace = function(learner) { - n_params = nrow(learner$instance$search_space$data) + +# Prepare the list for converting to `configs.json` +get_configs = function(learner){ + list(TBD = "TBD") +} + + +# Prepare the list for converting to `configspace.json` +get_configspace = function(instance) { + n_params = nrow(instance$search_space$data) hyperparameters_list = lapply(seq_len(n_params), function(i) { row = search_space$data[i, ] @@ -48,7 +99,7 @@ get_configspace = function(learner) { # categorical params if (type == "categorical") { choices = unlist(row[["levels"]]) - # TBD: default + # FIXME: the entry `default` is missing return(list( name = name, type = type, @@ -64,7 +115,7 @@ get_configspace = function(learner) { lower = exp(lower) upper = exp(upper) } - # TBD: default + # FIXME: the entry `default` entry is missing return(list( name = name, type = type, @@ -78,8 +129,9 @@ get_configspace = function(learner) { child = row[["id"]] parent = row[["on"]] - # `cond` (below) is a list of `Condition`s. Currently, there are only 'CondEqual' and 'CondAnyOf', - # which should not be used simultaneously. So this list should always contain only one entry. + # `cond` below is a list of `Condition`s. + # Currently, there are only 'CondEqual' and 'CondAnyOf', which should not be used simultaneously. + # So this list should always contain only one entry. 
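To make the two branches below concrete, an illustrative serialization (names and values assumed, not taken from a real run): a `CondEqual` dependency becomes an "EQ" condition with a single `value`, while a `CondAnyOf` dependency becomes an "IN" condition whose `values` field holds the allowed levels:

#   EQ: {"child": "svm.degree", "parent": "svm.kernel", "type": "EQ", "value": "polynomial"}
#   IN: {"child": "svm.degree", "parent": "svm.kernel", "type": "IN", "values": ["polynomial", "radial"]}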
cond = row[["cond"]][[1]] if (is(cond, "CondEqual")) { return(list(child = child, parent = parent, type = "EQ", value = cond$rhs)) @@ -94,14 +146,12 @@ get_configspace = function(learner) { )) } -get_configs = function(learner){ - list(TBD = "TBD") -} get_history = function(learner){ list(TBD = "TBD") } + get_meta = function(learner){ list(TBD = "TBD") } From 71e7de3324da222107a8f2e33e4e467332822ac4 Mon Sep 17 00:00:00 2001 From: b-zhou Date: Sun, 11 Aug 2024 00:10:08 +0200 Subject: [PATCH 11/34] feat: configs.json --- R/save_deepcave_run.R | 45 +++++++++++++++++------ tests/testthat/test_save_deepcave_run.R | 48 +++++++++++++++++++++---- 2 files changed, 76 insertions(+), 17 deletions(-) diff --git a/R/save_deepcave_run.R b/R/save_deepcave_run.R index 8d291fa..7c0f2fc 100644 --- a/R/save_deepcave_run.R +++ b/R/save_deepcave_run.R @@ -18,7 +18,7 @@ #' @export save_deepcave_run = function(instance, path = "logs/mlr3automl", prefix = "run", overwrite = FALSE) { # don't save untuned instance - if (!length(instance$result_learner_param_vals)) { + if (is.null(instance$result_learner_param_vals)) { warning("No run is saved, because no tuning has been completed.") return() } @@ -40,6 +40,14 @@ save_deepcave_run = function(instance, path = "logs/mlr3automl", prefix = "run", } new_idx = new_idx + 1 run_path = file.path(path, paste0(prefix, "_", new_idx)) + dir.create(run_path) + } else { + run_path = file.path(path, prefix) + if (file.exists(run_path)) { + lapply(list.files(run_path, full.names = TRUE), file.remove) + } else{ + dir.create(run_path) + } } jsonlite::write_json( @@ -67,8 +75,8 @@ save_deepcave_run = function(instance, path = "logs/mlr3automl", prefix = "run", ) # create `origins.json` (a list of `null`s) - origins = rep(list(NULL), instance$instance$archive$n_evals) - names(origins) = seq(instance$instance$archive$n_evals) - 1 + origins = rep(list(NULL), instance$archive$n_evals) + names(origins) = seq(instance$archive$n_evals) - 1 jsonlite::write_json( origins, paste0(run_path, "/origins.json"), @@ -77,9 +85,24 @@ save_deepcave_run = function(instance, path = "logs/mlr3automl", prefix = "run", } -# Prepare the list for converting to `configs.json` -get_configs = function(learner){ - list(TBD = "TBD") +# Prepare the lists for converting to `configs.json` +get_configs = function(instance){ + param_ids = instance$search_space$data[, id] + + configs_list = map(seq_len(instance$archive$n_evals), function(i) { + row = as.list(instance$archive$data[i, ]) + tuned_params = grep(paste0("^", row[["branch.selection"]]), param_ids, value = TRUE) + walk(tuned_params, function(param) { + if (instance$search_space$is_logscale[[param]]) { + row[[param]] = exp(row[[param]]) + } + }) + return(row[c("branch.selection", tuned_params)]) + }) + names(configs_list) = seq_along(configs_list) - 1 + jsonlite::toJSON(configs_list, auto_unbox = TRUE, null = "null", na = "null", pretty = TRUE) + + return(configs_list) } @@ -88,7 +111,7 @@ get_configspace = function(instance) { n_params = nrow(instance$search_space$data) hyperparameters_list = lapply(seq_len(n_params), function(i) { - row = search_space$data[i, ] + row = instance$search_space$data[i, ] name = row[["id"]] type = switch(row[["class"]], ParamFct = "categorical", @@ -108,7 +131,7 @@ get_configspace = function(instance) { } # int / float params - is_logscale = search_space$is_logscale[[name]] + is_logscale = instance$search_space$is_logscale[[name]] lower = row[["lower"]] upper = row[["upper"]] if (is_logscale) { @@ -125,7 +148,7 @@ get_configspace = 
function(instance) { }) conditions_list = lapply(seq_len(n_params), function(i) { - row = search_space$deps[i, ] + row = instance$search_space$deps[i, ] child = row[["id"]] parent = row[["on"]] @@ -147,11 +170,11 @@ get_configspace = function(instance) { } -get_history = function(learner){ +get_history = function(instance){ list(TBD = "TBD") } -get_meta = function(learner){ +get_meta = function(instance){ list(TBD = "TBD") } diff --git a/tests/testthat/test_save_deepcave_run.R b/tests/testthat/test_save_deepcave_run.R index 05befdf..d42e9aa 100644 --- a/tests/testthat/test_save_deepcave_run.R +++ b/tests/testthat/test_save_deepcave_run.R @@ -13,12 +13,48 @@ test_that("run is saved", { learner$train(task) dir = tempdir() + expected_path = file.path(dir, "test-1-run_1") + if (file.exists(expected_path)) { + lapply(list.files(expected_path, full.names = TRUE), file.remove) + } + file.remove(expected_path) - save_deepcave_run(learner, path = paste0(dir)) + save_deepcave_run(learner$instance, path = dir, prefix = "test-1-run", overwrite = FALSE) - expect_file_exists(paste0(dir, "/configspace.json")) - expect_file_exists(paste0(dir, "/configs.json")) - expect_file_exists(paste0(dir, "/history.json")) - expect_file_exists(paste0(dir, "/meta.json")) - expect_file_exists(paste0(dir, "/origins.json")) + expect_file_exists(file.path(expected_path, "configspace.json")) + expect_file_exists(file.path(expected_path, "configs.json")) + # expect_file_exists(file.path(expected_path, "history.jsonl")) + expect_file_exists(file.path(expected_path, "meta.json")) + expect_file_exists(file.path(expected_path, "origins.json")) +}) + +test_that("overwriting works", { + dir = tempdir() + expected_path = file.path(dir, "test-run-overwrite") + file.create(file.path(expected_path, "configs.json"), showWarnings = FALSE) + file.create(file.path(expected_path, "configspace.json"), showWarnings = FALSE) + file.create(file.path(expected_path, "history.jsonl"), showWarnings = FALSE) + file.create(file.path(expected_path, "meta.json"), showWarnings = FALSE) + file.create(file.path(expected_path, "origins.json"), showWarnings = FALSE) + + rush_plan(n_workers = 2) + skip_if_not_installed("e1071") + + task = tsk("penguins") + learner = lrn("classif.auto", + learner_ids = "svm", + small_data_size = 1, + resampling = rsmp("holdout"), + measure = msr("classif.ce"), + terminator = trm("evals", n_evals = 6) + ) + learner$train(task) + + save_deepcave_run(learner$instance, path = dir, prefix = "test-run-overwrite", overwrite = TRUE) + + expect_file_exists(file.path(expected_path, "configspace.json")) + expect_file_exists(file.path(expected_path, "configs.json")) + # expect_file_exists(file.path(expected_path, "history.jsonl")) + expect_file_exists(file.path(expected_path, "meta.json")) + expect_file_exists(file.path(expected_path, "origins.json")) }) From 26706d73ebac4bf8efe784787acee10136de8d17 Mon Sep 17 00:00:00 2001 From: b-zhou Date: Sun, 11 Aug 2024 15:21:45 +0200 Subject: [PATCH 12/34] feat: history.jsonl --- R/save_deepcave_run.R | 45 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 37 insertions(+), 8 deletions(-) diff --git a/R/save_deepcave_run.R b/R/save_deepcave_run.R index 7c0f2fc..864c39d 100644 --- a/R/save_deepcave_run.R +++ b/R/save_deepcave_run.R @@ -62,17 +62,46 @@ save_deepcave_run = function(instance, path = "logs/mlr3automl", prefix = "run", auto_unbox = TRUE, pretty = TRUE, null = "null" ) - jsonlite::write_json( - get_history(instance), - paste0(run_path, "/history.json"), - auto_unbox = TRUE, 
pretty = TRUE, null = "null"
-  )
+  # jsonlite::write_json(
+  #   get_history(instance),
+  #   paste0(run_path, "/history.json"),
+  #   auto_unbox = TRUE, pretty = TRUE, null = "null"
+  # )

   jsonlite::write_json(
     get_meta(instance),
     paste0(run_path, "/meta.json"),
     auto_unbox = TRUE, pretty = TRUE, null = "null"
   )
+
+  # stream out `history.jsonl`
+  n_evals = instance$archive$n_evals
+  # FIXME: make time an optional cost
+  costs = c(instance$objective$codomain$data[, id], "runtime_learners")
+  selected_cols = c(costs, "timestamp_xs", "timestamp_ys", "state")
+  history_table = instance$archive$data[, ..selected_cols][, .(
+    config_id = seq_len(n_evals) - 1,
+    budget = 0,
+    seed = -1,
+    costs = lapply(transpose(.SD), c),
+    # handle start and end time (time elapsed since first timestamp)
+    # see https://github.com/automl/DeepCAVE/blob/main/deepcave/runs/recorder.py
+    start_time = as.numeric(timestamp_xs - timestamp_xs[1]),
+    end_time = as.numeric(timestamp_ys - timestamp_ys[1]),
+    # state is either "finished" (SUCESS = 1) or "queued" (NOT_EVALUATED = 0)
+    # see https://github.com/automl/DeepCAVE/blob/main/deepcave/runs/status.py
+    state = ifelse(state == "finished", 1, 6),
+    additionals = list()
+  ), .SDcols = costs]
+
+  con = file("history.jsonl", open = "w")
+  jsonlite::stream_out(
+    history_table,
+    con,
+    auto_unbox = TRUE, pretty = TRUE, null = "list", na = "null",
+    dataframe = "values"
+  )
+  close(con)

   # create `origins.json` (a list of `null`s)
   origins = rep(list(NULL), instance$archive$n_evals)
   names(origins) = seq(instance$archive$n_evals) - 1

From 836c873bf77e47eb38f0c5ecd2a04ae7233082dd Mon Sep 17 00:00:00 2001
From: b-zhou
Date: Sun, 11 Aug 2024 15:50:46 +0200
Subject: [PATCH 13/34] fix: save jsonl

---
 R/save_deepcave_run.R                   | 20 ++++----------------
 tests/testthat/test_save_deepcave_run.R |  6 +++---
 2 files changed, 7 insertions(+), 19 deletions(-)

diff --git a/R/save_deepcave_run.R b/R/save_deepcave_run.R
index 864c39d..2a7c5a2 100644
--- a/R/save_deepcave_run.R
+++ b/R/save_deepcave_run.R
@@ -28,8 +28,8 @@ save_deepcave_run = function(instance, path = "logs/mlr3automl", prefix = "run",
   # (https://github.com/automl/DeepCAVE/blob/main/deepcave/runs/recorder.py)
   if (!overwrite) {
     new_idx = 0
-    for (fn in list.files(path)) {
-      if (!startsWith(fn, "prefix")) next
+    walk(list.files(path), function(fn) {
+      # compare against the `prefix` argument, not the literal string "prefix"
+      if (!startsWith(fn, prefix)) return()
       idx = last(strsplit(fn, "_")[[1]])
-      if (is.numeric(idx)) {
+      # `idx` is a character string, so test for digits instead of is.numeric()
+      if (grepl("^\\d+$", idx)) {
         idx_int = as.integer(idx)
         if (idx_int > new_idx) {
-          new_idx = idx_int
+          # assign into the enclosing scope, since this now runs inside a closure
+          new_idx <<- idx_int
         }
       }
-    }
+    })
     new_idx = new_idx + 1
@@ -62,12 +62,6 @@ save_deepcave_run = function(instance, path = "logs/mlr3automl", prefix = "run",
     auto_unbox = TRUE, pretty = TRUE, null = "null"
   )

-  # jsonlite::write_json(
-  #   get_history(instance),
-  #   paste0(run_path, "/history.json"),
-  #   auto_unbox = TRUE, pretty = TRUE, null = "null"
-  # )
-
   jsonlite::write_json(
     get_meta(instance),
     paste0(run_path, "/meta.json"),
     auto_unbox = TRUE, pretty = TRUE, null = "null"
   )

-  con = file("history.jsonl", open = "w")
+  con = file(file.path(run_path, "history.jsonl"), open = "w")
   jsonlite::stream_out(
     history_table,
     con,
@@ -192,12 +192,6 @@
get_configspace = function(instance) { )) } - -# get_history = function(instance){ -# list(TBD = "TBD") -# } - - get_meta = function(instance){ list(TBD = "TBD") } diff --git a/tests/testthat/test_save_deepcave_run.R b/tests/testthat/test_save_deepcave_run.R index d42e9aa..d24cd47 100644 --- a/tests/testthat/test_save_deepcave_run.R +++ b/tests/testthat/test_save_deepcave_run.R @@ -16,14 +16,14 @@ test_that("run is saved", { expected_path = file.path(dir, "test-1-run_1") if (file.exists(expected_path)) { lapply(list.files(expected_path, full.names = TRUE), file.remove) + file.remove(expected_path) } - file.remove(expected_path) save_deepcave_run(learner$instance, path = dir, prefix = "test-1-run", overwrite = FALSE) expect_file_exists(file.path(expected_path, "configspace.json")) expect_file_exists(file.path(expected_path, "configs.json")) - # expect_file_exists(file.path(expected_path, "history.jsonl")) + expect_file_exists(file.path(expected_path, "history.jsonl")) expect_file_exists(file.path(expected_path, "meta.json")) expect_file_exists(file.path(expected_path, "origins.json")) }) @@ -54,7 +54,7 @@ test_that("overwriting works", { expect_file_exists(file.path(expected_path, "configspace.json")) expect_file_exists(file.path(expected_path, "configs.json")) - # expect_file_exists(file.path(expected_path, "history.jsonl")) + expect_file_exists(file.path(expected_path, "history.jsonl")) expect_file_exists(file.path(expected_path, "meta.json")) expect_file_exists(file.path(expected_path, "origins.json")) }) From 2962a041d1397736de596eda7b0ee47f0b3ff649 Mon Sep 17 00:00:00 2001 From: b-zhou Date: Sun, 11 Aug 2024 16:59:33 +0200 Subject: [PATCH 14/34] feat: meta.json --- R/save_deepcave_run.R | 105 +++++++++++++++++++++++++++++------------- 1 file changed, 74 insertions(+), 31 deletions(-) diff --git a/R/save_deepcave_run.R b/R/save_deepcave_run.R index 2a7c5a2..c684837 100644 --- a/R/save_deepcave_run.R +++ b/R/save_deepcave_run.R @@ -50,54 +50,39 @@ save_deepcave_run = function(instance, path = "logs/mlr3automl", prefix = "run", } } + + # `configspace.json` jsonlite::write_json( get_configspace(instance), paste0(run_path, "/configspace.json"), auto_unbox = TRUE, pretty = TRUE, null = "null" ) - + + # `configs.json` jsonlite::write_json( get_configs(instance), - paste0(run_path, "/configs.json"), + file.path(run_path, "configs.json"), auto_unbox = TRUE, pretty = TRUE, null = "null" ) + # `meta.json` jsonlite::write_json( get_meta(instance), - paste0(run_path, "/meta.json"), + file.path(run_path, "meta.json"), auto_unbox = TRUE, pretty = TRUE, null = "null" ) - # stream out `history.jsonl` - n_evals = instance$archive$n_evals - # FIXME: make time an optional cost - costs = c(instance$objective$codomain$data[, id], "runtime_learners") - selected_cols = c(costs, "timestamp_xs", "timestamp_ys", "state") - history_table = instance$archive$data[, ..selected_cols][, .( - config_id = seq_len(n_evals) - 1, - budget = 0, - seed = -1, - costs = lapply(transpose(.SD), c), - # handle start and end time (time elapsed since first timestamp) - # see https://github.com/automl/DeepCAVE/blob/main/deepcave/runs/recorder.py - start_time = as.numeric(timestamp_xs - timestamp_xs[1]), - end_time = as.numeric(timestamp_ys - timestamp_ys[1]), - # state is either "finished" (SUCESS = 1) or "queued" (NOT_EVALUATED = 0) - # see https://github.com/automl/DeepCAVE/blob/main/deepcave/runs/status.py - state = ifelse(state == "finished", 1, 6), - additionals = list() - ), .SDcols = costs] - + # `history.jsonl` con = 
file(file.path(run_path, "history.jsonl"), open = "w") jsonlite::stream_out( - history_table, + get_history(instance), con, auto_unbox = TRUE, pretty = TRUE, null = "list", na = "null", dataframe = "values" ) close(con) - # create `origins.json` (a list of `null`s) + # `origins.json` (a list of `null`s) origins = rep(list(NULL), instance$archive$n_evals) names(origins) = seq(instance$archive$n_evals) - 1 jsonlite::write_json( @@ -122,8 +107,8 @@ get_configs = function(instance){ }) return(row[c("branch.selection", tuned_params)]) }) + names(configs_list) = seq_along(configs_list) - 1 - jsonlite::toJSON(configs_list, auto_unbox = TRUE, null = "null", na = "null", pretty = TRUE) return(configs_list) } @@ -133,7 +118,7 @@ get_configs = function(instance){ get_configspace = function(instance) { n_params = nrow(instance$search_space$data) - hyperparameters_list = lapply(seq_len(n_params), function(i) { + hyperparameters_list = map(seq_len(n_params), function(i) { row = instance$search_space$data[i, ] name = row[["id"]] type = switch(row[["class"]], @@ -150,7 +135,8 @@ get_configspace = function(instance) { name = name, type = type, choices = choices, - weights = NULL)) + weights = NULL + )) } # int / float params @@ -167,10 +153,11 @@ get_configspace = function(instance) { type = type, log = is_logscale, lower = lower, - upper = upper)) + upper = upper + )) }) - conditions_list = lapply(seq_len(n_params), function(i) { + conditions_list = map(seq_len(n_params), function(i) { row = instance$search_space$deps[i, ] child = row[["id"]] parent = row[["on"]] @@ -192,6 +179,62 @@ get_configspace = function(instance) { )) } +get_history = function(instance) { + selected_cols = c(costs, "timestamp_xs", "timestamp_ys", "state") + history_table = instance$archive$data[, ..selected_cols][, .( + config_id = seq_len(n_evals) - 1, + budget = 0, + seed = -1, + costs = lapply(transpose(.SD), c), + # handle start and end time (time elapsed since first timestamp) + # see https://github.com/automl/DeepCAVE/blob/main/deepcave/runs/recorder.py + start_time = as.numeric(timestamp_xs - timestamp_xs[1]), + end_time = as.numeric(timestamp_ys - timestamp_ys[1]), + # state is either "finished" (SUCESS = 1) or "queued" (NOT_EVALUATED = 0) + # see https://github.com/automl/DeepCAVE/blob/main/deepcave/runs/status.py + state = ifelse(state == "finished", 1, 6), + additionals = list() + ), .SDcols = costs] + + return(history_table) +} + get_meta = function(instance){ - list(TBD = "TBD") + costs = instance$objective$codomain$data[, id] + + objectives_list = map(costs, function(cost) { + measure = msr(cost) + + lower = measure$range[[1]] + if (is.finite(lower)) { + lock_lower = TRUE + } else { + lower = min(instance$archive$data[, ..cost]) + lock_lower = FALSE + } + + upper = measure$range[[2]] + if (is.finite(upper)) { + lock_upper = TRUE + } else { + upper = max(instance$archive$data[, ..cost]) + lock_upper = FALSE + } + + optimize = if (measure$minimize) { + "lower" + } else { + "upper" + } + + return(list(name = cost, lower = lower, upper = upper, + lock_lower = lock_lower, lock_upper = lock_upper, optimize = optimize)) + + }) + + return(list( + objectives = objectives_list, + budgets = rep(list(0), instance$archive$n_evals), + seeds = list(-1) + )) } From 02647285bfce98e36015bdef182f10a590f7e590 Mon Sep 17 00:00:00 2001 From: b-zhou Date: Sun, 11 Aug 2024 17:01:29 +0200 Subject: [PATCH 15/34] fix: jsonl verbose --- R/save_deepcave_run.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git 
a/R/save_deepcave_run.R b/R/save_deepcave_run.R index c684837..870dd70 100644 --- a/R/save_deepcave_run.R +++ b/R/save_deepcave_run.R @@ -78,7 +78,8 @@ save_deepcave_run = function(instance, path = "logs/mlr3automl", prefix = "run", get_history(instance), con, auto_unbox = TRUE, pretty = TRUE, null = "list", na = "null", - dataframe = "values" + dataframe = "values", + verbose = FALSE ) close(con) From 2e8ec9472f939cacdcab36e322595bbaf03d5b34 Mon Sep 17 00:00:00 2001 From: b-zhou Date: Sun, 11 Aug 2024 19:00:25 +0200 Subject: [PATCH 16/34] fix: conditions --- R/save_deepcave_run.R | 49 ++++++++++++++++++++++++++----------------- 1 file changed, 30 insertions(+), 19 deletions(-) diff --git a/R/save_deepcave_run.R b/R/save_deepcave_run.R index 870dd70..c6b5f0b 100644 --- a/R/save_deepcave_run.R +++ b/R/save_deepcave_run.R @@ -84,8 +84,8 @@ save_deepcave_run = function(instance, path = "logs/mlr3automl", prefix = "run", close(con) # `origins.json` (a list of `null`s) - origins = rep(list(NULL), instance$archive$n_evals) - names(origins) = seq(instance$archive$n_evals) - 1 + origins = rep(list(NULL), nrow(instance$archive$data)) + names(origins) = seq(nrow(instance$archive$data)) - 1 jsonlite::write_json( origins, paste0(run_path, "/origins.json"), @@ -98,15 +98,15 @@ save_deepcave_run = function(instance, path = "logs/mlr3automl", prefix = "run", get_configs = function(instance){ param_ids = instance$search_space$data[, id] - configs_list = map(seq_len(instance$archive$n_evals), function(i) { - row = as.list(instance$archive$data[i, ]) - tuned_params = grep(paste0("^", row[["branch.selection"]]), param_ids, value = TRUE) + configs_list = map(seq_len(nrow(instance$archive$data)), function(i) { + config = as.list(instance$archive$data[i, ]) + tuned_params = grep(paste0("^", config[["branch.selection"]]), param_ids, value = TRUE) walk(tuned_params, function(param) { if (instance$search_space$is_logscale[[param]]) { - row[[param]] = exp(row[[param]]) + config[[param]] = exp(config[[param]]) } }) - return(row[c("branch.selection", tuned_params)]) + return(discard(config[c("branch.selection", tuned_params)], is.na)) }) names(configs_list) = seq_along(configs_list) - 1 @@ -118,6 +118,7 @@ get_configs = function(instance){ # Prepare the list for converting to `configspace.json` get_configspace = function(instance) { n_params = nrow(instance$search_space$data) + param_ids = instance$search_space$data[, id] hyperparameters_list = map(seq_len(n_params), function(i) { row = instance$search_space$data[i, ] @@ -131,12 +132,13 @@ get_configspace = function(instance) { # categorical params if (type == "categorical") { choices = unlist(row[["levels"]]) - # FIXME: the entry `default` is missing + # FIXME: `default` is wrong return(list( name = name, type = type, choices = choices, - weights = NULL + default = choices[[1]], + probabilisties = NULL )) } @@ -144,33 +146,40 @@ get_configspace = function(instance) { is_logscale = instance$search_space$is_logscale[[name]] lower = row[["lower"]] upper = row[["upper"]] + default = mean(lower, upper) if (is_logscale) { lower = exp(lower) upper = exp(upper) + default = exp(default) } - # FIXME: the entry `default` entry is missing + # FIXME: default is wrong return(list( name = name, type = type, log = is_logscale, lower = lower, - upper = upper + upper = upper, + default = default, + q = NULL )) }) - conditions_list = map(seq_len(n_params), function(i) { - row = instance$search_space$deps[i, ] - child = row[["id"]] - parent = row[["on"]] + conditions_list = 
map(setdiff(param_ids, "branch.selection"), function(param_id) { + dependency = instance$search_space$deps[id == param_id, ] + if (nrow(dependency) > 1) { + dependency = dependency[on != "branch.selection", ] + } + child = param_id + parent = dependency[["on"]] # `cond` below is a list of `Condition`s. # Currently, there are only 'CondEqual' and 'CondAnyOf', which should not be used simultaneously. # So this list should always contain only one entry. - cond = row[["cond"]][[1]] + cond = dependency[["cond"]][[1]] if (is(cond, "CondEqual")) { return(list(child = child, parent = parent, type = "EQ", value = cond$rhs)) - } - return(list(child = child, parent = parent, type = "IN", values = cond$rhs)) + } + return(list(child = child, parent = parent, type = "IN", values = cond$rhs)) }) return(list( @@ -181,9 +190,11 @@ get_configspace = function(instance) { } get_history = function(instance) { + costs = instance$objective$codomain$data[, id] + selected_cols = c(costs, "timestamp_xs", "timestamp_ys", "state") history_table = instance$archive$data[, ..selected_cols][, .( - config_id = seq_len(n_evals) - 1, + config_id = seq_len(nrow(instance$archive$data)) - 1, budget = 0, seed = -1, costs = lapply(transpose(.SD), c), From 04e4ee7c7734f6054c3cf3635d4d7926d51ad078 Mon Sep 17 00:00:00 2001 From: b-zhou Date: Sun, 11 Aug 2024 20:26:27 +0200 Subject: [PATCH 17/34] fix: configs --- R/save_deepcave_run.R | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/R/save_deepcave_run.R b/R/save_deepcave_run.R index c6b5f0b..4480bd8 100644 --- a/R/save_deepcave_run.R +++ b/R/save_deepcave_run.R @@ -97,18 +97,13 @@ save_deepcave_run = function(instance, path = "logs/mlr3automl", prefix = "run", # Prepare the lists for converting to `configs.json` get_configs = function(instance){ param_ids = instance$search_space$data[, id] + logscale_params = param_ids[instance$search_space$is_logscale[param_ids]] + config_table = instance$archive$data[, param_ids, with = FALSE] + config_table[, (logscale_params) := lapply(.SD, exp), .SDcols = logscale_params] - configs_list = map(seq_len(nrow(instance$archive$data)), function(i) { - config = as.list(instance$archive$data[i, ]) - tuned_params = grep(paste0("^", config[["branch.selection"]]), param_ids, value = TRUE) - walk(tuned_params, function(param) { - if (instance$search_space$is_logscale[[param]]) { - config[[param]] = exp(config[[param]]) - } - }) - return(discard(config[c("branch.selection", tuned_params)], is.na)) - }) - + configs_list = map(seq_len(nrow(config_table)), function(i) { + discard(as.list(config_table[i, ]), is.na) + }) names(configs_list) = seq_along(configs_list) - 1 return(configs_list) @@ -190,7 +185,7 @@ get_configspace = function(instance) { } get_history = function(instance) { - costs = instance$objective$codomain$data[, id] + costs = c(instance$objective$codomain$data[, id], "runtime_learners") selected_cols = c(costs, "timestamp_xs", "timestamp_ys", "state") history_table = instance$archive$data[, ..selected_cols][, .( @@ -202,9 +197,9 @@ get_history = function(instance) { # see https://github.com/automl/DeepCAVE/blob/main/deepcave/runs/recorder.py start_time = as.numeric(timestamp_xs - timestamp_xs[1]), end_time = as.numeric(timestamp_ys - timestamp_ys[1]), - # state is either "finished" (SUCESS = 1) or "queued" (NOT_EVALUATED = 0) + # state is either "finished" <=> SUCESS = 1 or ABORTED = 0 # see https://github.com/automl/DeepCAVE/blob/main/deepcave/runs/status.py - state = ifelse(state == "finished", 1, 6), 
+      state = ifelse(state == "finished", 1, 5),
       additionals = list()
     ), .SDcols = costs]

From b1c7d1e779de82c90bf0ed6a4997f906b3a03089 Mon Sep 17 00:00:00 2001
From: b-zhou
Date: Sun, 11 Aug 2024 20:26:52 +0200
Subject: [PATCH 18/34] fix: budget

---
 R/save_deepcave_run.R | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/R/save_deepcave_run.R b/R/save_deepcave_run.R
index 4480bd8..57dfb40 100644
--- a/R/save_deepcave_run.R
+++ b/R/save_deepcave_run.R
@@ -236,12 +236,20 @@ get_meta = function(instance){
     return(list(name = cost, lower = lower, upper = upper,
       lock_lower = lock_lower, lock_upper = lock_upper, optimize = optimize))
-  })
+  })
+
+  objectives_list = c(objectives_list, list(list(
+    name = "time",
+    lower = 0,
+    upper = max(instance$archive$data[, runtime_learners]),
+    lock_lower = TRUE,
+    lock_upper = FALSE,
+    optimize = "lower"
+  )))
+
   return(list(
     objectives = objectives_list,
-    budgets = rep(list(0), instance$archive$n_evals),
+    budgets = list(0),
     seeds = list(-1)
   ))
 }

From 15722db53c0a28cba06f1b786f1524e4dad28310 Mon Sep 17 00:00:00 2001
From: b-zhou
Date: Sun, 11 Aug 2024 20:54:27 +0200
Subject: [PATCH 19/34] refactor: comments

---
 R/save_deepcave_run.R | 33 +++++++++++++++++++++------------
 1 file changed, 21 insertions(+), 12 deletions(-)

diff --git a/R/save_deepcave_run.R b/R/save_deepcave_run.R
index 57dfb40..cbba48f 100644
--- a/R/save_deepcave_run.R
+++ b/R/save_deepcave_run.R
@@ -94,7 +94,7 @@ save_deepcave_run = function(instance, path = "logs/mlr3automl", prefix = "run",
 }

-# Prepare the lists for converting to `configs.json`
+# Prepare the list for converting to `configs.json`
 get_configs = function(instance){
   param_ids = instance$search_space$data[, id]
@@ -115,9 +115,9 @@ get_configs = function(instance){
 # Prepare the list for converting to `configspace.json`
 get_configspace = function(instance) {
   n_params = nrow(instance$search_space$data)
   param_ids = instance$search_space$data[, id]

-  hyperparameters_list = map(seq_len(n_params), function(i) {
-    row = instance$search_space$data[i, ]
-    name = row[["id"]]
+  hyperparameters_list = map(param_ids, function(param_id) {
+    row = instance$search_space$data[id == param_id, ]
+
     type = switch(row[["class"]],
       ParamFct = "categorical",
       ParamLgl = "categorical",
       ParamDbl = "uniform_float",
       ParamInt = "uniform_int")

     # categorical params
     if (type == "categorical") {
       choices = unlist(row[["levels"]])
-      # FIXME: `default` is wrong
       return(list(
-        name = name,
+        name = param_id,
         type = type,
         choices = choices,
+        # FIXME: `default` is wrong
         default = choices[[1]],
-        probabilisties = NULL
+        probabilities = NULL
       ))
     }

     # int / float params
-    is_logscale = instance$search_space$is_logscale[[name]]
+    is_logscale = instance$search_space$is_logscale[[param_id]]
     lower = row[["lower"]]
     upper = row[["upper"]]
+    # FIXME: default is wrong
-    default = mean(lower, upper)
+    default = mean(c(lower, upper))
     if (is_logscale) {
       lower = exp(lower)
       upper = exp(upper)
       default = exp(default)
     }
-    # FIXME: default is wrong
     return(list(
-      name = name,
+      name = param_id,
       type = type,
       log = is_logscale,
       lower = lower,

   conditions_list = map(setdiff(param_ids, "branch.selection"), function(param_id) {
     dependency = instance$search_space$deps[id == param_id, ]
+    # `svm.degree` and `svm.gamma` depend on `svm.kernel` as well as `branch.selection`.
+    # DeepCAVE does not allow one parameter to be conditioned on multiple others.
+    # So remove their dependency on `branch.selection`.
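To illustrate the filter below (rows are assumed for illustration, not read from a real search space): for `svm.degree` the deps table holds two rows,

#   id           on                  cond
#   svm.degree   branch.selection    CondEqual("svm")
#   svm.degree   svm.kernel          CondEqual("polynomial")

and dropping the `branch.selection` row leaves a single parent, `svm.kernel`, which is the shape DeepCAVE expects.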
if (nrow(dependency) > 1) {
       dependency = dependency[on != "branch.selection", ]
     }
     child = param_id
     parent = dependency[["on"]]

+# Prepare the data.table for converting to `history.jsonl`
 get_history = function(instance) {
   costs = c(instance$objective$codomain$data[, id], "runtime_learners")

   selected_cols = c(costs, "timestamp_xs", "timestamp_ys", "state")
   history_table = instance$archive$data[, ..selected_cols][, .(
     config_id = seq_len(nrow(instance$archive$data)) - 1,
     budget = 0,
     seed = -1,
+    # combine costs into a list column
     costs = lapply(transpose(.SD), c),
     # handle start and end time (time elapsed since first timestamp)
     # see https://github.com/automl/DeepCAVE/blob/main/deepcave/runs/recorder.py
+    # start and end time here might have a different meaning than in the original implementation
     start_time = as.numeric(timestamp_xs - timestamp_xs[1]),
     end_time = as.numeric(timestamp_ys - timestamp_ys[1]),
-    # state is either "finished" <=> SUCESS = 1 or ABORTED = 0
+    # state is either "finished" <=> SUCCESS = 1 or ABORTED = 5
     # see https://github.com/automl/DeepCAVE/blob/main/deepcave/runs/status.py
     state = ifelse(state == "finished", 1, 5),
     additionals = list()
   ), .SDcols = costs]

   return(history_table)
 }

+
+# Prepare the list for converting to `meta.json`
 get_meta = function(instance){
+  # time is handled separately below
   costs = instance$objective$codomain$data[, id]

   objectives_list = map(costs, function(cost) {
     measure = msr(cost)

     return(list(name = cost, lower = lower, upper = upper,
       lock_lower = lock_lower, lock_upper = lock_upper, optimize = optimize))
-  })
+  })

   objectives_list = c(objectives_list, list(list(

From 7412811e5a08bb59a5b563187850ec2931a96c96 Mon Sep 17 00:00:00 2001
From: b-zhou
Date: Sun, 11 Aug 2024 20:55:26 +0200
Subject: [PATCH 20/34] chore: gitignore local testing files

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 5387b99..626665b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -97,3 +97,4 @@ rsconnect/
 /attic/
 .Rprofile
 kaggle/
+deepcave
\ No newline at end of file

From a0e36e30e4f51aa27e3a6d87c69a8f41baa8bb2f Mon Sep 17 00:00:00 2001
From: b-zhou
Date: Tue, 13 Aug 2024 14:28:16 +0200
Subject: [PATCH 21/34] revert: remove regr related stuff

This reverts commits 953940b, 90c4757, 2927164, 823bced, 4bb4d0c, 2c99b98,
30de063, 766bccd

---
 DESCRIPTION                              |   3 -
 R/LearnerClassifAuto.R                   | 343 ++++++++++++++++++++++-
 R/LearnerRegrAuto.R                      |  66 +----
 R/build_graph.R                          | 202 -------------
 R/train_auto.R                           | 155 ----------
 tests/testthat/test_LearnerClassifAuto.R |   2 +-
 tests/testthat/test_LearnerRegrAuto.R    | 296 -------------------
 7 files changed, 346 insertions(+), 721 deletions(-)
 delete mode 100644 R/build_graph.R
 delete mode 100644 R/train_auto.R
 delete mode 100644 tests/testthat/test_LearnerRegrAuto.R

diff --git a/DESCRIPTION b/DESCRIPTION
index 855771c..938ca9c 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -63,7 +63,4 @@ Collate:
     'LearnerClassifAutoXgboost.R'
     'LearnerRegrAuto.R'
     'helper.R'
-    'build_graph.R'
-    'save_deepcave_run.R'
-    'train_auto.R'
     'zzz.R'
diff --git a/R/LearnerClassifAuto.R b/R/LearnerClassifAuto.R
index 1d740b0..b3b126e 100644
--- a/R/LearnerClassifAuto.R
+++ b/R/LearnerClassifAuto.R
@@ -87,7 +87,159 @@ LearnerClassifAuto = R6Class("LearnerClassifAuto",

   private = list(
     .train = function(task) {
-      train_auto(self, task, task_type = "classif")
+      pv = self$param_set$values
+      learner_ids = pv$learner_ids
+      self$graph = build_graph(learner_ids)
+      self$tuning_space =
tuning_space[learner_ids] + + lg$debug("Training '%s' on task '%s'", self$id, task$id) + + # initialize mbo tuner + tuner = tnr("adbo") + + # remove learner based on memory limit + lg$debug("Starting to select from %i learners: %s", length(learner_ids), paste0(learner_ids, collapse = ",")) + + if (!is.null(pv$max_memory)) { + memory_usage = map_dbl(learner_ids, function(learner_id) { + self$graph$pipeops[[learner_id]]$learner$estimate_memory_usage(task) / 1e6 + }) + learner_ids = learner_ids[memory_usage < pv$max_memory] + lg$debug("Checking learners for memory limit of %i MB. Keeping %i learner(s): %s", pv$max_memory, length(learner_ids), paste0(learner_ids, collapse = ",")) + } + + # set number of threads + if (!is.null(pv$max_nthread)) { + lg$debug("Setting number of threads per learner to %i", pv$max_nthread) + walk(learner_ids, function(learner_id) { + set_threads(self$graph$pipeops[[learner_id]]$learner, pv$max_nthread) + }) + } + + # reduce number of workers on large data sets + if (!is.null(pv$large_data_size) && task$nrow * task$ncol > pv$large_data_size) { + lg$debug("Task size larger than %i rows", pv$large_data_size) + + learner_ids = intersect(learner_ids, pv$large_data_learner_ids) + self$tuning_space = tuning_space[learner_ids] + lg$debug("Keeping %i learner(s): %s", length(learner_ids), paste0(learner_ids, collapse = ",")) + + lg$debug("Increasing number of threads per learner to %i", pv$large_data_nthread) + walk(learner_ids, function(learner_id) { + set_threads(self$graph$pipeops[[learner_id]]$learner, pv$large_data_nthread) + }) + n_workers = rush_config()$n_workers + n = max(1, floor(n_workers / pv$large_data_nthread)) + tuner$param_set$set_values(n_workers = n) + lg$debug("Reducing number of workers to %i", n) + } + + # small data resampling + resampling = if (!is.null(pv$small_data_size) && task$nrow < pv$small_data_size) { + lg$debug("Task has less than %i rows", pv$small_data_size) + lg$debug("Using small data set resampling with %i iterations", pv$small_data_resampling$iters) + pv$small_data_resampling + } else { + pv$resampling + } + + # cardinality + cardinality = map_int(task$col_info$levels, length) + if (!is.null(pv$max_cardinality) && any(cardinality > pv$max_cardinality)) { + lg$debug("Reducing number of factor levels to %i", pv$max_cardinality) + + # collapse factors + pipeop_ids = names(self$graph$pipeops) + pipeop_ids = pipeop_ids[grep("collapse", pipeop_ids)] + walk(pipeop_ids, function(pipeop_id) { + self$graph$pipeops[[pipeop_id]]$param_set$values$target_level_count = pv$max_cardinality + }) + } + + if ("extra_trees" %in% learner_ids && any(cardinality > pv$extra_trees_max_cardinality)) { + lg$debug("Reducing number of factor levels to %i for extra trees", pv$extra_trees_max_cardinality) + self$graph$pipeops$extra_trees_collapse$param_set$values$target_level_count = pv$extra_trees_max_cardinality + } + + # initialize graph learner + graph_learner = as_learner(self$graph) + graph_learner$id = "graph_learner" + graph_learner$predict_type = pv$measure$predict_type + graph_learner$fallback = lrn("classif.featureless", predict_type = pv$measure$predict_type) + graph_learner$encapsulate = c(train = "callr", predict = "callr") + graph_learner$timeout = c(train = pv$learner_timeout, predict = pv$learner_timeout) + + learners_with_validation = intersect(learner_ids, c("xgboost", "catboost", "lightgbm")) + if (length(learners_with_validation)) { + set_validate(graph_learner, "test", ids = learners_with_validation) + } + + # set early stopping + if ("xgboost" 
%in% learner_ids) { + graph_learner$param_set$values$xgboost.callbacks = list(cb_timeout_xgboost(pv$learner_timeout * 0.8)) + graph_learner$param_set$values$xgboost.eval_metric = pv$xgboost_eval_metric + } + if ("catboost" %in% learner_ids) { + graph_learner$param_set$values$catboost.eval_metric = pv$catboost_eval_metric + } + if ("lightgbm" %in% learner_ids) { + graph_learner$param_set$values$lightgbm.callbacks = list(cb_timeout_lightgbm(pv$learner_timeout * 0.8)) + graph_learner$param_set$values$lightgbm.eval = pv$lightgbm_eval_metric + } + + # initialize search space + tuning_space = unlist(unname(self$tuning_space), recursive = FALSE) + graph_scratch = graph_learner$clone(deep = TRUE) + graph_scratch$param_set$set_values(.values = tuning_space) + graph_scratch$param_set$set_values(branch.selection = to_tune(learner_ids)) + search_space = graph_scratch$param_set$search_space() + walk(learner_ids, function(learner_id) { + param_ids = search_space$ids() + param_ids = grep(paste0("^", learner_id), param_ids, value = TRUE) + walk(param_ids, function(param_id) { + # skip internal tuning parameter + if (param_id %in% c("xgboost.nrounds", "catboost.iterations", "lightgbm.num_iterations")) return() + search_space$add_dep( + id = param_id, + on = "branch.selection", + cond = CondEqual$new(learner_id) + ) + }) + }) + + # initial design + lhs_xdt = generate_lhs_design(pv$lhs_size, self$task_type, setdiff(learner_ids, c("lda", "extra_trees")), self$tuning_space) + default_xdt = generate_default_design(self$task_type, learner_ids, task, self$tuning_space) + initial_xdt = rbindlist(list(lhs_xdt, default_xdt), use.names = TRUE, fill = TRUE) + setorderv(initial_xdt, "branch.selection") + tuner$param_set$set_values(initial_design = initial_xdt) + + # initialize auto tuner + self$instance = ti_async( + task = task, + learner = graph_learner, + resampling = resampling, + measures = pv$measure, + terminator = pv$terminator, + search_space = search_space, + callbacks = pv$callbacks, + store_benchmark_result = pv$store_benchmark_result + ) + + # tune + lg$debug("Learner '%s' starts tuning phase", self$id) + tuner$optimize(self$instance) + + # fit final model + lg$debug("Learner '%s' fits final model", self$id) + if (length(learners_with_validation)) { + set_validate(graph_learner, NULL, ids = intersect(learner_ids, c("xgboost", "catboost", "lightgbm"))) + } + graph_learner$param_set$set_values(.values = self$instance$result_learner_param_vals, .insert = FALSE) + graph_learner$timeout = c(train = Inf, predict = Inf) + graph_learner$train(task) + + list(graph_learner = graph_learner, instance = self$instance) }, .predict = function(task) { @@ -99,3 +251,192 @@ LearnerClassifAuto = R6Class("LearnerClassifAuto", #' @include aaa.R learners[["classif.auto"]] = LearnerClassifAuto + +build_graph = function(learner_ids) { + branches = list() + # glmnet + if ("glmnet" %in% learner_ids) { + branch_glmnet = po("removeconstants", id = "glmnet_removeconstants") %>>% + po("imputehist", id = "glmnet_imputehist") %>>% + po("imputeoor", id = "glmnet_imputeoor") %>>% + po("fixfactors", id = "glmnet_fixfactors") %>>% + po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "glmnet_imputesample") %>>% + po("collapsefactors", target_level_count = 100, id = "glmnet_collapse") %>>% + po("encode", method = "one-hot", id = "glmnet_encode") %>>% + po("removeconstants", id = "glmnet_post_removeconstants") %>>% + lrn("classif.glmnet", id = "glmnet") + branches = c(branches, branch_glmnet) + } + + # kknn + if 
("kknn" %in% learner_ids) { + branch_kknn = po("removeconstants", id = "kknn_removeconstants") %>>% + po("imputehist", id = "kknn_imputehist") %>>% + po("imputeoor", id = "kknn_imputeoor") %>>% + po("fixfactors", id = "kknn_fixfactors") %>>% + po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "kknn_imputesample") %>>% + po("collapsefactors", target_level_count = 100, id = "kknn_collapse") %>>% + po("removeconstants", id = "kknn_post_removeconstants") %>>% + lrn("classif.kknn", id = "kknn") + branches = c(branches, branch_kknn) + } + + # lda + if ("lda" %in% learner_ids) { + branch_lda = po("removeconstants", id = "lda_removeconstants") %>>% + po("imputehist", id = "lda_imputehist") %>>% + po("imputeoor", id = "lda_imputeoor") %>>% + po("fixfactors", id = "lda_fixfactors") %>>% + po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "lda_imputesample") %>>% + po("collapsefactors", target_level_count = 100, id = "lda_collapse") %>>% + po("removeconstants", id = "lda_post_removeconstants") %>>% + lrn("classif.lda", id = "lda") + branches = c(branches, branch_lda) + } + + # nnet + if ("nnet" %in% learner_ids) { + branch_nnet = po("removeconstants", id = "nnet_removeconstants") %>>% + po("imputehist", id = "nnet_imputehist") %>>% + po("imputeoor", id = "nnet_imputeoor") %>>% + po("fixfactors", id = "nnet_fixfactors") %>>% + po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "nnet_imputesample") %>>% + po("collapsefactors", target_level_count = 100, id = "nnet_collapse") %>>% + po("removeconstants", id = "nnet_post_removeconstants") %>>% + lrn("classif.nnet", id = "nnet") + branches = c(branches, branch_nnet) + } + + # ranger + if ("ranger" %in% learner_ids) { + branch_ranger = po("removeconstants", id = "ranger_removeconstants") %>>% + po("imputeoor", id = "ranger_imputeoor") %>>% + po("fixfactors", id = "ranger_fixfactors") %>>% + po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "ranger_imputesample") %>>% + po("collapsefactors", target_level_count = 100, id = "ranger_collapse") %>>% + po("removeconstants", id = "ranger_post_removeconstants") %>>% + # use upper bound of search space for memory estimation + lrn("classif.ranger", id = "ranger", num.trees = 2000) + branches = c(branches, branch_ranger) + } + + # svm + if ("svm" %in% learner_ids) { + branch_svm = po("removeconstants", id = "svm_removeconstants") %>>% + po("imputehist", id = "svm_imputehist") %>>% + po("imputeoor", id = "svm_imputeoor") %>>% + po("fixfactors", id = "svm_fixfactors") %>>% + po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "svm_imputesample") %>>% + po("collapsefactors", target_level_count = 100, id = "svm_collapse") %>>% + po("encode", method = "one-hot", id = "svm_encode") %>>% + po("removeconstants", id = "svm_post_removeconstants") %>>% + lrn("classif.svm", id = "svm", type = "C-classification") + branches = c(branches, branch_svm) + } + + # xgboost + if ("xgboost" %in% learner_ids) { + branch_xgboost = po("removeconstants", id = "xgboost_removeconstants") %>>% + po("imputeoor", id = "xgboost_imputeoor") %>>% + po("fixfactors", id = "xgboost_fixfactors") %>>% + po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "xgboost_imputesample") %>>% + po("encodeimpact", id = "xgboost_encode") %>>% + po("removeconstants", id = "xgboost_post_removeconstants") %>>% + lrn("classif.xgboost", id = "xgboost", nrounds = 5000, early_stopping_rounds = 10) + 
branches = c(branches, branch_xgboost) + } + + # catboost + if ("catboost" %in% learner_ids) { + branch_catboost = po("colapply", id = "catboost_colapply", applicator = as.numeric, affect_columns = selector_type("integer")) %>>% + lrn("classif.catboost", id = "catboost", iterations = 500, early_stopping_rounds = 10, use_best_model = TRUE) + branches = c(branches, branch_catboost) + } + + # extra trees + if ("extra_trees" %in% learner_ids) { + branch_extra_trees = po("removeconstants", id = "extra_trees_removeconstants") %>>% + po("imputeoor", id = "extra_trees_imputeoor") %>>% + po("fixfactors", id = "extra_trees_fixfactors") %>>% + po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "extra_trees_imputesample") %>>% + po("collapsefactors", target_level_count = 40, id = "extra_trees_collapse") %>>% + po("removeconstants", id = "extra_trees_post_removeconstants") %>>% + lrn("classif.ranger", id = "extra_trees", splitrule = "extratrees", num.trees = 100, replace = FALSE, sample.fraction = 1) + branches = c(branches, branch_extra_trees) + } + + # lightgbm + if ("lightgbm" %in% learner_ids) { + branch_lightgbm = lrn("classif.lightgbm", id = "lightgbm", num_iterations = 5000, early_stopping_rounds = 10) + branches = c(branches, branch_lightgbm) + } + + # branch graph + po("branch", options = learner_ids) %>>% + gunion(branches) %>>% + po("unbranch", options = learner_ids) +} + +tuning_space = list( + glmnet = list( + glmnet.s = to_tune(1e-4, 1e4, logscale = TRUE), + glmnet.alpha = to_tune(0, 1) + ), + + kknn = list( + kknn.k = to_tune(1, 50, logscale = TRUE), + kknn.distance = to_tune(1, 5), + kknn.kernel = to_tune(c("rectangular", "optimal", "epanechnikov", "biweight", "triweight", "cos", "inv", "gaussian", "rank")) + ), + + lda = list(), + + extra_trees = list(), + + nnet = list( + nnet.maxit = to_tune(1e1, 1e3, logscale = TRUE), + nnet.decay = to_tune(1e-4, 1e-1, logscale = TRUE), + nnet.size = to_tune(2, 50, logscale = TRUE) + ), + + ranger = list( + ranger.mtry.ratio = to_tune(0, 1), + ranger.replace = to_tune(), + ranger.sample.fraction = to_tune(1e-1, 1), + ranger.num.trees = to_tune(500, 2000) + ), + + svm = list( + svm.cost = to_tune(1e-4, 1e4, logscale = TRUE), + svm.kernel = to_tune(c("polynomial", "radial", "sigmoid", "linear")), + svm.degree = to_tune(2, 5), + svm.gamma = to_tune(1e-4, 1e4, logscale = TRUE) + ), + + xgboost = list( + xgboost.eta = to_tune(1e-4, 1, logscale = TRUE), + xgboost.max_depth = to_tune(1, 20), + xgboost.colsample_bytree = to_tune(1e-1, 1), + xgboost.colsample_bylevel = to_tune(1e-1, 1), + xgboost.lambda = to_tune(1e-3, 1e3, logscale = TRUE), + xgboost.alpha = to_tune(1e-3, 1e3, logscale = TRUE), + xgboost.subsample = to_tune(1e-1, 1), + xgboost.nrounds = to_tune(1, 5000, internal = TRUE) + ), + + catboost = list( + catboost.depth = to_tune(5, 8), + catboost.learning_rate = to_tune(5e-3, 0.2, logscale = TRUE), + catboost.l2_leaf_reg = to_tune(1, 5), + catboost.iterations = to_tune(1, 500, internal = TRUE) + ), + + + lightgbm = list( + lightgbm.learning_rate = to_tune(5e-3, 0.2, logscale = TRUE), + lightgbm.feature_fraction = to_tune(0.75, 1), + lightgbm.min_data_in_leaf = to_tune(2, 60), + lightgbm.num_leaves = to_tune(16, 96), + lightgbm.num_iterations = to_tune(1, 5000, internal = TRUE) + ) +) diff --git a/R/LearnerRegrAuto.R b/R/LearnerRegrAuto.R index 6d93d86..bee6e86 100644 --- a/R/LearnerRegrAuto.R +++ b/R/LearnerRegrAuto.R @@ -22,77 +22,17 @@ LearnerRegrAuto = R6Class("LearnerRegrAuto", #' @description #' Creates a new 
instance of this [R6][R6::R6Class] class. - initialize = function(id = "regr.auto") { - param_set = ps( - # learner - learner_ids = p_uty(default = c("glmnet", "kknn", "nnet", "ranger", "svm", "xgboost", "catboost", "extra_trees", "lightgbm"), - custom_check = function(x) { - if (length(x) == 1 && x == "extra_trees") { - return("Learner 'extra_trees' must be combined with other learners") - } - check_subset(x, c("glmnet", "kknn", "nnet", "ranger", "svm", "xgboost", "catboost", "extra_trees", "lightgbm")) - }), - learner_timeout = p_int(lower = 1L, default = 900L), - xgboost_eval_metric = p_uty(), - catboost_eval_metric = p_uty(), - lightgbm_eval_metric = p_uty(), - # system - max_nthread = p_int(lower = 1L, default = 1L), - max_memory = p_int(lower = 1L, default = 32000L), - # large data - large_data_size = p_int(lower = 1L, default = 1e6), - large_data_learner_ids = p_uty(), - large_data_nthread = p_int(lower = 1L, default = 4L), - # small data - small_data_size = p_int(lower = 1L, default = 5000L), - small_data_resampling = p_uty(), - max_cardinality = p_int(lower = 1L, default = 100L), - extra_trees_max_cardinality = p_int(lower = 1L, default = 40L), - # tuner - resampling = p_uty(), - terminator = p_uty(), - measure = p_uty(), - lhs_size = p_int(lower = 1L, default = 4L), - callbacks = p_uty(), - store_benchmark_result = p_lgl(default = FALSE)) + initialize = function(id = "classif.auto") { - param_set$set_values( - learner_ids = c("glmnet", "kknn", "nnet", "ranger", "svm", "xgboost", "catboost", "extra_trees", "lightgbm"), - learner_timeout = 900L, - max_nthread = 1L, - max_memory = 32000L, - large_data_size = 1e6L, - large_data_learner_ids = c("ranger", "xgboost", "catboost", "extra_trees", "lightgbm"), - large_data_nthread = 4L, - small_data_size = 5000L, - small_data_resampling = rsmp("cv", folds = 10L), - max_cardinality = 100L, - extra_trees_max_cardinality = 40L, - resampling = rsmp("cv", folds = 3L), - terminator = trm("run_time", secs = 14400L), - measure = msr("regr.mse"), - lhs_size = 4L, - store_benchmark_result = FALSE) - - super$initialize( - id = id, - task_type = "regr", - param_set = param_set, - packages = c("mlr3tuning", "mlr3learners", "mlr3pipelines", "mlr3mbo", "mlr3automl", "xgboost", "catboost", "lightgbm", "ranger", "nnet", "kknn", "glmnet", "e1071"), - feature_types = c("logical", "integer", "numeric", "character", "factor"), - predict_types = "response", - properties = c("missings", "weights") - ) } ), private = list( .train = function(task) { - train_auto(self, task, task_type = "regr") + }, .predict = function(task) { - lg$debug("Predicting with '%s' on task '%s'", self$id, task$id) - self$model$graph_learner$predict(task) + } ) ) diff --git a/R/build_graph.R b/R/build_graph.R deleted file mode 100644 index c863ba6..0000000 --- a/R/build_graph.R +++ /dev/null @@ -1,202 +0,0 @@ -build_graph = function(learner_ids, task_type) { - assert_choice(task_type, c("classif", "regr")) - learners_reg = c("glmnet", "kknn", "nnet", "ranger", "svm", "xgboost", "catboost", "extra_trees", "lightgbm") - if (task_type == "regr") { - assert_subset(learner_ids, learners_reg) - } else { - assert_subset(learner_ids, c(learners_reg, "lda")) - } - - branches = list() - # glmnet - if ("glmnet" %in% learner_ids) { - branch_glmnet = po("removeconstants", id = "glmnet_removeconstants") %>>% - po("imputehist", id = "glmnet_imputehist") %>>% - po("imputeoor", id = "glmnet_imputeoor") %>>% - po("fixfactors", id = "glmnet_fixfactors") %>>% - po("imputesample", affect_columns = 
selector_type(c("factor", "ordered")), id = "glmnet_imputesample") %>>% - po("collapsefactors", target_level_count = 100, id = "glmnet_collapse") %>>% - po("encode", method = "one-hot", id = "glmnet_encode") %>>% - po("removeconstants", id = "glmnet_post_removeconstants") %>>% - lrn(paste0(task_type, ".glmnet"), id = "glmnet") - branches = c(branches, branch_glmnet) - } - - # kknn - if ("kknn" %in% learner_ids) { - branch_kknn = po("removeconstants", id = "kknn_removeconstants") %>>% - po("imputehist", id = "kknn_imputehist") %>>% - po("imputeoor", id = "kknn_imputeoor") %>>% - po("fixfactors", id = "kknn_fixfactors") %>>% - po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "kknn_imputesample") %>>% - po("collapsefactors", target_level_count = 100, id = "kknn_collapse") %>>% - po("removeconstants", id = "kknn_post_removeconstants") %>>% - lrn(paste0(task_type, ".kknn"), id = "kknn") - branches = c(branches, branch_kknn) - } - - # lda - # only for classification - if ("lda" %in% learner_ids) { - branch_lda = po("removeconstants", id = "lda_removeconstants") %>>% - po("imputehist", id = "lda_imputehist") %>>% - po("imputeoor", id = "lda_imputeoor") %>>% - po("fixfactors", id = "lda_fixfactors") %>>% - po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "lda_imputesample") %>>% - po("collapsefactors", target_level_count = 100, id = "lda_collapse") %>>% - po("removeconstants", id = "lda_post_removeconstants") %>>% - lrn("classif.lda", id = "lda") - branches = c(branches, branch_lda) - } - - # nnet - if ("nnet" %in% learner_ids) { - branch_nnet = po("removeconstants", id = "nnet_removeconstants") %>>% - po("imputehist", id = "nnet_imputehist") %>>% - po("imputeoor", id = "nnet_imputeoor") %>>% - po("fixfactors", id = "nnet_fixfactors") %>>% - po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "nnet_imputesample") %>>% - po("collapsefactors", target_level_count = 100, id = "nnet_collapse") %>>% - po("removeconstants", id = "nnet_post_removeconstants") %>>% - lrn(paste0(task_type, ".nnet"), id = "nnet") - branches = c(branches, branch_nnet) - } - - # ranger - if ("ranger" %in% learner_ids) { - branch_ranger = po("removeconstants", id = "ranger_removeconstants") %>>% - po("imputeoor", id = "ranger_imputeoor") %>>% - po("fixfactors", id = "ranger_fixfactors") %>>% - po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "ranger_imputesample") %>>% - po("collapsefactors", target_level_count = 100, id = "ranger_collapse") %>>% - po("removeconstants", id = "ranger_post_removeconstants") %>>% - # use upper bound of search space for memory estimation - lrn(paste0(task_type, ".ranger"), id = "ranger", num.trees = 2000) - branches = c(branches, branch_ranger) - } - - # svm - if ("svm" %in% learner_ids) { - svm_type = if (task_type == "classif") { - "C-classification" - } else { - "eps-regression" - } - branch_svm = po("removeconstants", id = "svm_removeconstants") %>>% - po("imputehist", id = "svm_imputehist") %>>% - po("imputeoor", id = "svm_imputeoor") %>>% - po("fixfactors", id = "svm_fixfactors") %>>% - po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "svm_imputesample") %>>% - po("collapsefactors", target_level_count = 100, id = "svm_collapse") %>>% - po("encode", method = "one-hot", id = "svm_encode") %>>% - po("removeconstants", id = "svm_post_removeconstants") %>>% - lrn(paste0(task_type, ".svm"), id = "svm", type = svm_type) - branches = c(branches, 
branch_svm) - } - - # xgboost - if ("xgboost" %in% learner_ids) { - branch_xgboost = po("removeconstants", id = "xgboost_removeconstants") %>>% - po("imputeoor", id = "xgboost_imputeoor") %>>% - po("fixfactors", id = "xgboost_fixfactors") %>>% - po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "xgboost_imputesample") %>>% - po("encodeimpact", id = "xgboost_encode") %>>% - po("removeconstants", id = "xgboost_post_removeconstants") %>>% - lrn(paste0(task_type, ".xgboost"), id = "xgboost", nrounds = 5000, early_stopping_rounds = 10) - branches = c(branches, branch_xgboost) - } - - # catboost - if ("catboost" %in% learner_ids) { - branch_catboost = po("colapply", id = "catboost_colapply", applicator = as.numeric, affect_columns = selector_type("integer")) %>>% - lrn(paste0(task_type, ".catboost"), id = "catboost", iterations = 500, early_stopping_rounds = 10, use_best_model = TRUE) - branches = c(branches, branch_catboost) - } - - # extra trees - if ("extra_trees" %in% learner_ids) { - branch_extra_trees = po("removeconstants", id = "extra_trees_removeconstants") %>>% - po("imputeoor", id = "extra_trees_imputeoor") %>>% - po("fixfactors", id = "extra_trees_fixfactors") %>>% - po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "extra_trees_imputesample") %>>% - po("collapsefactors", target_level_count = 40, id = "extra_trees_collapse") %>>% - po("removeconstants", id = "extra_trees_post_removeconstants") %>>% - lrn(paste0(task_type, ".ranger"), id = "extra_trees", splitrule = "extratrees", num.trees = 100, replace = FALSE, sample.fraction = 1) - branches = c(branches, branch_extra_trees) - } - - # lightgbm - if ("lightgbm" %in% learner_ids) { - branch_lightgbm = lrn(paste0(task_type, ".lightgbm"), id = "lightgbm", num_iterations = 5000, early_stopping_rounds = 10) - branches = c(branches, branch_lightgbm) - } - - # branch graph - po("branch", options = learner_ids) %>>% - gunion(branches) %>>% - po("unbranch", options = learner_ids) -} - -tuning_space = list( - glmnet = list( - glmnet.s = to_tune(1e-4, 1e4, logscale = TRUE), - glmnet.alpha = to_tune(0, 1) - ), - - kknn = list( - kknn.k = to_tune(1, 50, logscale = TRUE), - kknn.distance = to_tune(1, 5), - kknn.kernel = to_tune(c("rectangular", "optimal", "epanechnikov", "biweight", "triweight", "cos", "inv", "gaussian", "rank")) - ), - - lda = list(), - - extra_trees = list(), - - nnet = list( - nnet.maxit = to_tune(1e1, 1e3, logscale = TRUE), - nnet.decay = to_tune(1e-4, 1e-1, logscale = TRUE), - nnet.size = to_tune(2, 50, logscale = TRUE) - ), - - ranger = list( - ranger.mtry.ratio = to_tune(0, 1), - ranger.replace = to_tune(), - ranger.sample.fraction = to_tune(1e-1, 1), - ranger.num.trees = to_tune(500, 2000) - ), - - svm = list( - svm.cost = to_tune(1e-4, 1e4, logscale = TRUE), - svm.kernel = to_tune(c("polynomial", "radial", "sigmoid", "linear")), - svm.degree = to_tune(2, 5), - svm.gamma = to_tune(1e-4, 1e4, logscale = TRUE) - ), - - xgboost = list( - xgboost.eta = to_tune(1e-4, 1, logscale = TRUE), - xgboost.max_depth = to_tune(1, 20), - xgboost.colsample_bytree = to_tune(1e-1, 1), - xgboost.colsample_bylevel = to_tune(1e-1, 1), - xgboost.lambda = to_tune(1e-3, 1e3, logscale = TRUE), - xgboost.alpha = to_tune(1e-3, 1e3, logscale = TRUE), - xgboost.subsample = to_tune(1e-1, 1), - xgboost.nrounds = to_tune(1, 5000, internal = TRUE) - ), - - catboost = list( - catboost.depth = to_tune(5, 8), - catboost.learning_rate = to_tune(5e-3, 0.2, logscale = TRUE), - catboost.l2_leaf_reg = 
to_tune(1, 5), - catboost.iterations = to_tune(1, 500, internal = TRUE) - ), - - - lightgbm = list( - lightgbm.learning_rate = to_tune(5e-3, 0.2, logscale = TRUE), - lightgbm.feature_fraction = to_tune(0.75, 1), - lightgbm.min_data_in_leaf = to_tune(2, 60), - lightgbm.num_leaves = to_tune(16, 96), - lightgbm.num_iterations = to_tune(1, 5000, internal = TRUE) - ) -) \ No newline at end of file diff --git a/R/train_auto.R b/R/train_auto.R deleted file mode 100644 index 2ff174f..0000000 --- a/R/train_auto.R +++ /dev/null @@ -1,155 +0,0 @@ -train_auto = function(self, task, task_type) { - pv = self$param_set$values - learner_ids = pv$learner_ids - self$graph = build_graph(learner_ids, task_type) - self$tuning_space = tuning_space[learner_ids] - - lg$debug("Training '%s' on task '%s'", self$id, task$id) - - # initialize mbo tuner - tuner = tnr("adbo") - - # remove learner based on memory limit - lg$debug("Starting to select from %i learners: %s", length(learner_ids), paste0(learner_ids, collapse = ",")) - - if (!is.null(pv$max_memory)) { - memory_usage = map_dbl(learner_ids, function(learner_id) { - self$graph$pipeops[[learner_id]]$learner$estimate_memory_usage(task) / 1e6 - }) - learner_ids = learner_ids[memory_usage < pv$max_memory] - lg$debug("Checking learners for memory limit of %i MB. Keeping %i learner(s): %s", pv$max_memory, length(learner_ids), paste0(learner_ids, collapse = ",")) - } - - # set number of threads - if (!is.null(pv$max_nthread)) { - lg$debug("Setting number of threads per learner to %i", pv$max_nthread) - walk(learner_ids, function(learner_id) { - set_threads(self$graph$pipeops[[learner_id]]$learner, pv$max_nthread) - }) - } - - # reduce number of workers on large data sets - if (!is.null(pv$large_data_size) && task$nrow * task$ncol > pv$large_data_size) { - lg$debug("Task size larger than %i rows", pv$large_data_size) - - learner_ids = intersect(learner_ids, pv$large_data_learner_ids) - self$tuning_space = tuning_space[learner_ids] - lg$debug("Keeping %i learner(s): %s", length(learner_ids), paste0(learner_ids, collapse = ",")) - - lg$debug("Increasing number of threads per learner to %i", pv$large_data_nthread) - walk(learner_ids, function(learner_id) { - set_threads(self$graph$pipeops[[learner_id]]$learner, pv$large_data_nthread) - }) - n_workers = rush_config()$n_workers - n = max(1, floor(n_workers / pv$large_data_nthread)) - tuner$param_set$set_values(n_workers = n) - lg$debug("Reducing number of workers to %i", n) - } - - # small data resampling - resampling = if (!is.null(pv$small_data_size) && task$nrow < pv$small_data_size) { - lg$debug("Task has less than %i rows", pv$small_data_size) - lg$debug("Using small data set resampling with %i iterations", pv$small_data_resampling$iters) - pv$small_data_resampling - } else { - pv$resampling - } - - # cardinality - cardinality = map_int(task$col_info$levels, length) - if (!is.null(pv$max_cardinality) && any(cardinality > pv$max_cardinality)) { - lg$debug("Reducing number of factor levels to %i", pv$max_cardinality) - - # collapse factors - pipeop_ids = names(self$graph$pipeops) - pipeop_ids = pipeop_ids[grep("collapse", pipeop_ids)] - walk(pipeop_ids, function(pipeop_id) { - self$graph$pipeops[[pipeop_id]]$param_set$values$target_level_count = pv$max_cardinality - }) - } - - if ("extra_trees" %in% learner_ids && any(cardinality > pv$extra_trees_max_cardinality)) { - lg$debug("Reducing number of factor levels to %i for extra trees", pv$extra_trees_max_cardinality) - 
self$graph$pipeops$extra_trees_collapse$param_set$values$target_level_count = pv$extra_trees_max_cardinality - } - - # initialize graph learner - graph_learner = as_learner(self$graph) - graph_learner$id = "graph_learner" - graph_learner$predict_type = pv$measure$predict_type - graph_learner$fallback = lrn(paste0(task_type, ".featureless"), predict_type = pv$measure$predict_type) - graph_learner$encapsulate = c(train = "callr", predict = "callr") - graph_learner$timeout = c(train = pv$learner_timeout, predict = pv$learner_timeout) - - learners_with_validation = intersect(learner_ids, c("xgboost", "catboost", "lightgbm")) - if (length(learners_with_validation)) { - set_validate(graph_learner, "test", ids = learners_with_validation) - } - - # set early stopping - if ("xgboost" %in% learner_ids) { - graph_learner$param_set$values$xgboost.callbacks = list(cb_timeout_xgboost(pv$learner_timeout * 0.8)) - graph_learner$param_set$values$xgboost.eval_metric = pv$xgboost_eval_metric - } - if ("catboost" %in% learner_ids) { - graph_learner$param_set$values$catboost.eval_metric = pv$catboost_eval_metric - } - if ("lightgbm" %in% learner_ids) { - graph_learner$param_set$values$lightgbm.callbacks = list(cb_timeout_lightgbm(pv$learner_timeout * 0.8)) - graph_learner$param_set$values$lightgbm.eval = pv$lightgbm_eval_metric - } - - # initialize search space - tuning_space = unlist(unname(self$tuning_space), recursive = FALSE) - graph_scratch = graph_learner$clone(deep = TRUE) - graph_scratch$param_set$set_values(.values = tuning_space) - graph_scratch$param_set$set_values(branch.selection = to_tune(learner_ids)) - search_space = graph_scratch$param_set$search_space() - walk(learner_ids, function(learner_id) { - param_ids = search_space$ids() - param_ids = grep(paste0("^", learner_id), param_ids, value = TRUE) - walk(param_ids, function(param_id) { - # skip internal tuning parameter - if (param_id %in% c("xgboost.nrounds", "catboost.iterations", "lightgbm.num_iterations")) return() - search_space$add_dep( - id = param_id, - on = "branch.selection", - cond = CondEqual$new(learner_id) - ) - }) - }) - - # initial design - lhs_xdt = generate_lhs_design(pv$lhs_size, self$task_type, setdiff(learner_ids, c("lda", "extra_trees")), self$tuning_space) - default_xdt = generate_default_design(self$task_type, learner_ids, task, self$tuning_space) - initial_xdt = rbindlist(list(lhs_xdt, default_xdt), use.names = TRUE, fill = TRUE) - setorderv(initial_xdt, "branch.selection") - tuner$param_set$set_values(initial_design = initial_xdt) - - # initialize auto tuner - self$instance = ti_async( - task = task, - learner = graph_learner, - resampling = resampling, - measures = pv$measure, - terminator = pv$terminator, - search_space = search_space, - callbacks = pv$callbacks, - store_benchmark_result = pv$store_benchmark_result - ) - - # tune - lg$debug("Learner '%s' starts tuning phase", self$id) - tuner$optimize(self$instance) - - # fit final model - lg$debug("Learner '%s' fits final model", self$id) - if (length(learners_with_validation)) { - set_validate(graph_learner, NULL, ids = intersect(learner_ids, c("xgboost", "catboost", "lightgbm"))) - } - graph_learner$param_set$set_values(.values = self$instance$result_learner_param_vals, .insert = FALSE) - graph_learner$timeout = c(train = Inf, predict = Inf) - graph_learner$train(task) - - list(graph_learner = graph_learner, instance = self$instance) -} \ No newline at end of file diff --git a/tests/testthat/test_LearnerClassifAuto.R b/tests/testthat/test_LearnerClassifAuto.R 
index 4ea0f26..7eebf33 100644 --- a/tests/testthat/test_LearnerClassifAuto.R +++ b/tests/testthat/test_LearnerClassifAuto.R @@ -211,7 +211,7 @@ test_that("extra_trees and glmnet works", { ) expect_class(learner$train(task), "LearnerClassifAuto") - expect_choice(learner$model$instance$result$branch.selection, c("extra_trees", "glmnet")) + expect_equal(learner$model$instance$result$branch.selection, "extra_trees") }) test_that("lightgbm works", { diff --git a/tests/testthat/test_LearnerRegrAuto.R b/tests/testthat/test_LearnerRegrAuto.R deleted file mode 100644 index 4b274cc..0000000 --- a/tests/testthat/test_LearnerRegrAuto.R +++ /dev/null @@ -1,296 +0,0 @@ -test_that("glmnet works (regr)", { - rush_plan(n_workers = 2) - skip_if_not_installed("glmnet") - - task = tsk("boston_housing") - learner = lrn("regr.auto", - learner_ids = "glmnet", - small_data_size = 1, - resampling = rsmp("holdout"), - measure = msr("regr.mse"), - terminator = trm("evals", n_evals = 6) - ) - - expect_class(learner$train(task), "LearnerRegrAuto") - expect_equal(learner$graph$param_set$values$branch.selection, "glmnet") - expect_equal(learner$model$instance$result$branch.selection, "glmnet") -}) - -test_that("kknn works (regr)", { - rush_plan(n_workers = 2) - skip_if_not_installed("kknn") - - task = tsk("boston_housing") - learner = lrn("regr.auto", - learner_ids = "kknn", - small_data_size = 1, - resampling = rsmp("holdout"), - measure = msr("regr.mse"), - terminator = trm("evals", n_evals = 6) - ) - - expect_class(learner$train(task), "LearnerRegrAuto") - expect_equal(learner$graph$param_set$values$branch.selection, "kknn") - expect_equal(learner$model$instance$result$branch.selection, "kknn") -}) - -test_that("nnet works (regr)", { - rush_plan(n_workers = 2) - skip_if_not_installed("nnet") - - task = tsk("boston_housing") - learner = lrn("regr.auto", - learner_ids = "nnet", - resampling = rsmp("holdout"), - small_data_size = 1, - measure = msr("regr.mse"), - terminator = trm("evals", n_evals = 6) - ) - - expect_class(learner$train(task), "LearnerRegrAuto") - expect_equal(learner$model$instance$result$branch.selection, "nnet") -}) - -test_that("ranger works (regr)", { - rush_plan(n_workers = 2) - skip_if_not_installed("ranger") - - - task = tsk("boston_housing") - learner = lrn("regr.auto", - learner_ids = "ranger", - small_data_size = 1, - resampling = rsmp("holdout"), - measure = msr("regr.mse"), - terminator = trm("evals", n_evals = 6) - ) - - expect_class(learner$train(task), "LearnerRegrAuto") - expect_equal(learner$model$instance$result$branch.selection, "ranger") -}) - -test_that("svm works (regr)", { - rush_plan(n_workers = 2) - skip_if_not_installed("e1071") - - - task = tsk("boston_housing") - learner = lrn("regr.auto", - learner_ids = "svm", - small_data_size = 1, - resampling = rsmp("holdout"), - measure = msr("regr.mse"), - terminator = trm("evals", n_evals = 6) - ) - - expect_class(learner$train(task), "LearnerRegrAuto") - expect_equal(learner$model$instance$result$branch.selection, "svm") -}) - -test_that("xgboost works (regr)", { - skip_if_not_installed("xgboost") - rush_plan(n_workers = 2) - - task = tsk("boston_housing") - learner = lrn("regr.auto", - learner_ids = "xgboost", - small_data_size = 1, - xgboost_eval_metric = "mlogloss", - resampling = rsmp("holdout"), - measure = msr("regr.mse"), - terminator = trm("evals", n_evals = 6) - ) - - expect_class(learner$train(task), "LearnerRegrAuto") - expect_equal(learner$model$instance$result$branch.selection, "xgboost") -}) - -test_that("catboost works 
(regr)", { - skip_if_not_installed("catboost") - rush_plan(n_workers = 2) - - task = tsk("boston_housing") - learner = lrn("regr.auto", - learner_ids = "catboost", - small_data_size = 1, - # catboost_eval_metric = "MultiClass", - resampling = rsmp("holdout"), - measure = msr("regr.mse"), - terminator = trm("evals", n_evals = 6) - ) - - expect_class(learner$train(task), "LearnerRegrAuto") - expect_equal(learner$model$instance$result$branch.selection, "catboost") -}) - -test_that("only extra_trees fails", { - rush_plan(n_workers = 2) - - task = tsk("boston_housing") - expect_error(lrn("regr.auto", - learner_ids = "extra_trees", - resampling = rsmp("holdout"), - measure = msr("regr.mse"), - terminator = trm("evals", n_evals = 6) - ), "must be combined with other learners") -}) - -test_that("extra_trees and glmnet works (regr)", { - skip_if_not_installed("glmnet") - rush_plan(n_workers = 2) - - task = tsk("boston_housing") - learner = lrn("regr.auto", - learner_ids = c("extra_trees", "glmnet"), - resampling = rsmp("holdout"), - measure = msr("regr.mse"), - terminator = trm("evals", n_evals = 6) - ) - - expect_class(learner$train(task), "LearnerRegrAuto") - expect_choice(learner$model$instance$result$branch.selection, c("extra_trees", "glmnet")) -}) - -test_that("lightgbm works (regr)", { - skip_if_not_installed("lightgbm") - rush_plan(n_workers = 2) - - task = tsk("boston_housing") - learner = lrn("regr.auto", - learner_ids = "lightgbm", - lightgbm_eval_metric = "multi_logloss", - resampling = rsmp("holdout"), - measure = msr("regr.mse"), - terminator = trm("evals", n_evals = 6) - ) - - expect_class(learner$train(task), "LearnerRegrAuto") - expect_equal(learner$model$instance$result$branch.selection, "lightgbm") -}) - -test_that("xgboost, catboost and lightgbm work (regr)", { - skip_if_not_installed(c("xgboost", "catboost", "lightgbm")) - rush_plan(n_workers = 2) - - task = tsk("boston_housing") - learner = lrn("regr.auto", - learner_ids = c("xgboost", "catboost", "lightgbm"), - # catboost_eval_metric = "MultiClass", - # lightgbm_eval_metric = "multi_logloss", - # xgboost_eval_metric = "mlogloss", - resampling = rsmp("holdout"), - lhs_size = 1, - measure = msr("regr.mse"), - terminator = trm("evals", n_evals = 20), - callbacks = clbk("mlr3tuning.async_save_logs") - ) - - expect_class(learner$train(task), "LearnerRegrAuto") -}) - -test_that("all learner work (regr)", { - skip_if_not_installed(c("glmnet", "kknn", "nnet", "ranger", "e1071", "xgboost", "catboost", "MASS", "lightgbm")) - rush_plan(n_workers = 2) - - task = tsk("boston_housing") - learner = lrn("regr.auto", - small_data_size = 100, - measure = msr("regr.mse"), - terminator = trm("evals", n_evals = 20), - lhs_size = 1 - ) - - expect_class(learner$train(task), "LearnerRegrAuto") - expect_class(learner$model$instance, "TuningInstanceAsyncSingleCrit") - expect_prediction(learner$predict(task)) -}) - -# test_that("memory limit works", { -# skip_if_not_installed(c("glmnet", "kknn", "nnet", "ranger", "e1071", "xgboost", "catboost", "MASS", "lightgbm")) -# rush_plan(n_workers = 2) - -# task = tsk("spam") -# learner = lrn("regr.auto", -# max_memory = 50, -# small_data_size = 100, -# measure = msr("regr.mse"), -# terminator = trm("evals", n_evals = 20), -# resampling = rsmp("holdout"), -# lhs_size = 1 -# ) - -# learner$train(task) -# }) - -test_that("small data set switch works (regr)", { - skip_if_not_installed(c("glmnet", "kknn", "nnet", "ranger", "e1071", "xgboost", "catboost", "lightgbm")) - rush_plan(n_workers = 2) - - task = 
tsk("boston_housing") - learner = lrn("regr.auto", - small_data_size = 1000, - small_data_resampling = rsmp("cv", folds = 2), - measure = msr("regr.mse"), - terminator = trm("evals", n_evals = 1), - lhs_size = 1, - store_benchmark_result = TRUE - ) - - expect_class(learner$train(task), "LearnerRegrAuto") - expect_equal(learner$model$instance$archive$benchmark_result$resamplings$resampling[[1]]$iters, 2) -}) - -test_that("large data set switch works (regr)", { - skip_if_not_installed(c("glmnet", "kknn", "nnet", "ranger", "e1071", "xgboost", "catboost", "lightgbm")) - rush_plan(n_workers = 2) - - task = tsk("boston_housing") - learner = lrn("regr.auto", - large_data_size = 100, - large_data_nthread = 4, - large_data_learner_ids = "ranger", - small_data_size = 100, - measure = msr("regr.mse"), - terminator = trm("evals", n_evals = 1), - lhs_size = 1, - store_benchmark_result = TRUE - ) - - expect_class(learner$train(task), "LearnerRegrAuto") - expect_set_equal(learner$model$instance$archive$data$branch.selection, "ranger") -}) - -test_that("max_cardinality works (regr)", { - skip_if_not_installed(c("glmnet", "kknn", "nnet", "ranger", "e1071", "xgboost", "catboost", "lightgbm")) - rush_plan(n_workers = 2) - - task = tsk("boston_housing") - learner = lrn("regr.auto", - small_data_size = 1, - resampling = rsmp("holdout"), - max_cardinality = 2, - measure = msr("regr.mse"), - terminator = trm("evals", n_evals = 1), - lhs_size = 1 - ) - - expect_class(learner$train(task), "LearnerRegrAuto") -}) - -test_that("max_cardinality works for extra trees (regr)", { - skip_if_not_installed(c("glmnet", "kknn", "nnet", "ranger", "e1071", "xgboost", "catboost", "lightgbm")) - rush_plan(n_workers = 2) - - task = tsk("boston_housing") - learner = lrn("regr.auto", - small_data_size = 1, - resampling = rsmp("holdout"), - max_cardinality = 3, - extra_trees_max_cardinality = 2, - measure = msr("regr.mse"), - terminator = trm("evals", n_evals = 1), - lhs_size = 1 - ) - - expect_class(learner$train(task), "LearnerRegrAuto") -}) From ae13f949d8259eae5f2ec91b7d3432d070ec1d52 Mon Sep 17 00:00:00 2001 From: b-zhou Date: Tue, 13 Aug 2024 14:34:20 +0200 Subject: [PATCH 22/34] chore: collate --- DESCRIPTION | 1 + 1 file changed, 1 insertion(+) diff --git a/DESCRIPTION b/DESCRIPTION index 938ca9c..ae3d58a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -62,5 +62,6 @@ Collate: 'LearnerClassifAutoSVM.R' 'LearnerClassifAutoXgboost.R' 'LearnerRegrAuto.R' + 'save_deepcave_run.R' 'helper.R' 'zzz.R' From f3bca1e0872d82a6b3ef9b8cd08b2e63261eda15 Mon Sep 17 00:00:00 2001 From: b-zhou Date: Tue, 13 Aug 2024 15:18:03 +0200 Subject: [PATCH 23/34] fix: global variables --- R/save_deepcave_run.R | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/R/save_deepcave_run.R b/R/save_deepcave_run.R index cbba48f..500dcca 100644 --- a/R/save_deepcave_run.R +++ b/R/save_deepcave_run.R @@ -96,7 +96,7 @@ save_deepcave_run = function(instance, path = "logs/mlr3automl", prefix = "run", # Prepare the list for converting to `configs.json` get_configs = function(instance){ - param_ids = instance$search_space$data[, id] + param_ids = instance$search_space$data$id logscale_params = param_ids[instance$search_space$is_logscale[param_ids]] config_table = instance$archive$data[, param_ids, with = FALSE] config_table[, (logscale_params) := lapply(.SD, exp), .SDcols = logscale_params] @@ -113,9 +113,10 @@ get_configs = function(instance){ # Prepare the list for converting to `configspace.json` get_configspace = 
function(instance) {
   n_params = nrow(instance$search_space$data)
-  param_ids = instance$search_space$data[, id]
+  param_ids = instance$search_space$data$id
 
   hyperparameters_list = map(param_ids, function(param_id) {
+    id = NULL # resolve global variable note in R CMD check
     row = instance$search_space$data[id == param_id, ]
 
@@ -160,11 +161,13 @@ get_configspace = function(instance) {
   })
 
   conditions_list = map(setdiff(param_ids, "branch.selection"), function(param_id) {
+    id = NULL # resolve global variable note in R CMD check
     dependency = instance$search_space$deps[id == param_id, ]
     # `svm.degree` and `svm.gamma` depend on `svm.kernel` as well as `branch.selection`.
     # DeepCAVE does not allow one parameter to be conditioned on multiple others.
     # So remove their dependency on `branch.selection`.
     if (nrow(dependency) > 1) {
+      on = NULL # resolve global variable note in R CMD check
       dependency = dependency[on != "branch.selection", ]
     }
     child = param_id
@@ -174,7 +177,7 @@ get_configspace = function(instance) {
     # Currently, there are only 'CondEqual' and 'CondAnyOf', which should not be used simultaneously.
     # So this list should always contain only one entry.
     cond = dependency[["cond"]][[1]]
-    if (is(cond, "CondEqual")) {
+    if (class(cond)[[1]] == "CondEqual") {
       return(list(child = child, parent = parent, type = "EQ", value = cond$rhs))
     }
     return(list(child = child, parent = parent, type = "IN", values = cond$rhs))
@@ -189,10 +192,11 @@ get_configspace = function(instance) {
 
 # Prepare the data.table for converting to `history.jsonl`
 get_history = function(instance) {
-  costs = c(instance$objective$codomain$data[, id], "runtime_learners")
+  costs = c(instance$objective$codomain$data$id, "runtime_learners")
   selected_cols = c(costs, "timestamp_xs", "timestamp_ys", "state")
 
-  history_table = instance$archive$data[, ..selected_cols][, .(
+  timestamp_xs = timestamp_ys = state = NULL # resolve global variable note in R CMD check
+  history_table = instance$archive$data[, selected_cols, with = FALSE][, list(
     config_id = seq_len(nrow(instance$archive$data)) - 1,
     budget = 0,
     seed = -1,
@@ -216,7 +220,7 @@ get_history = function(instance) {
 
 # Prepare the list for converting to 'meta.json'
 get_meta = function(instance){
   # time is handled separately below
-  costs = instance$objective$codomain$data[, id]
+  costs = instance$objective$codomain$data$id
 
   objectives_list = map(costs, function(cost) {
     measure = msr(cost)
@@ -225,7 +229,7 @@ get_meta = function(instance){
     if (is.finite(lower)) {
       lock_lower = TRUE
     } else {
-      lower = min(instance$archive$data[, ..cost])
+      lower = min(instance$archive$data[, cost, with = FALSE])
       lock_lower = FALSE
     }
 
@@ -233,7 +237,7 @@ get_meta = function(instance){
     if (is.finite(upper)) {
       lock_upper = TRUE
     } else {
-      upper = max(instance$archive$data[, ..cost])
+      upper = max(instance$archive$data[, cost, with = FALSE])
       lock_upper = FALSE
     }
 
@@ -250,7 +254,7 @@ get_meta = function(instance){
   objectives_list = c(objectives_list, list(list(
     name = "time",
     lower = 0,
-    upper = max(instance$archive$data[, runtime_learners]),
+    upper = max(instance$archive$data[, "runtime_learners", with = FALSE]),
     lock_lower = TRUE,
     lock_upper = FALSE,
     optimize = "lower"

From c306d7c3465884dcc261d82b374200c87b1952f3 Mon Sep 17 00:00:00 2001
From: b-zhou
Date: Tue, 13 Aug 2024 15:22:24 +0200
Subject: [PATCH 24/34] test: no overwrite

---
 tests/testthat/test_save_deepcave_run.R | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git 
a/tests/testthat/test_save_deepcave_run.R b/tests/testthat/test_save_deepcave_run.R index d24cd47..ffc0717 100644 --- a/tests/testthat/test_save_deepcave_run.R +++ b/tests/testthat/test_save_deepcave_run.R @@ -1,4 +1,4 @@ -test_that("run is saved", { +test_that("run is saved without overwriting", { rush_plan(n_workers = 2) skip_if_not_installed("e1071") @@ -13,7 +13,13 @@ test_that("run is saved", { learner$train(task) dir = tempdir() - expected_path = file.path(dir, "test-1-run_1") + prefix = "test-1-run" + previous_run = file.path(dir, prefix, "run_1") + if (!file.exists(previous_run)) { + dir.create(previous_run) + } + + expected_path = file.path(dir, "test-1-run_2") if (file.exists(expected_path)) { lapply(list.files(expected_path, full.names = TRUE), file.remove) file.remove(expected_path) From 3bb2d9c80dccba01eaf605d90d4575cd79fac7a6 Mon Sep 17 00:00:00 2001 From: b-zhou Date: Tue, 13 Aug 2024 15:39:19 +0200 Subject: [PATCH 25/34] fix: save without overwriting --- R/save_deepcave_run.R | 16 +++++++++++----- tests/testthat/test_save_deepcave_run.R | 15 ++++++++------- 2 files changed, 19 insertions(+), 12 deletions(-) diff --git a/R/save_deepcave_run.R b/R/save_deepcave_run.R index 500dcca..ee06c4c 100644 --- a/R/save_deepcave_run.R +++ b/R/save_deepcave_run.R @@ -28,18 +28,24 @@ save_deepcave_run = function(instance, path = "logs/mlr3automl", prefix = "run", # (https://github.com/automl/DeepCAVE/blob/main/deepcave/runs/recorder.py) if (!overwrite) { new_idx = 0 - walk(list.files(path), function(fn) { - if (!startsWith(fn, "prefix")) return() - idx = last(strsplit(fn, "_")[[1]]) - if (is.numeric(idx)) { + for (fn in list.files(path)) { + if (!startsWith(fn, prefix)) next + + splitted = strsplit(fn, "_")[[1]] + if (length(splitted) == 1) next # no run index attached + + idx = suppressWarnings(last(splitted)) + if (!is.na(idx)) { # idx is successfully coerced to a number idx_int = as.integer(idx) if (idx_int > new_idx) { new_idx = idx_int } } - }) + } + new_idx = new_idx + 1 run_path = file.path(path, paste0(prefix, "_", new_idx)) + dir.create(run_path) } else { run_path = file.path(path, prefix) diff --git a/tests/testthat/test_save_deepcave_run.R b/tests/testthat/test_save_deepcave_run.R index ffc0717..0460c2d 100644 --- a/tests/testthat/test_save_deepcave_run.R +++ b/tests/testthat/test_save_deepcave_run.R @@ -1,4 +1,12 @@ test_that("run is saved without overwriting", { + dir = tempdir() + prefix = "test-1-run" + dir.create(file.path(dir, prefix)) + previous_run = file.path(dir, prefix, "run_1") + if (!file.exists(previous_run)) { + dir.create(previous_run) + } + rush_plan(n_workers = 2) skip_if_not_installed("e1071") @@ -11,13 +19,6 @@ test_that("run is saved without overwriting", { terminator = trm("evals", n_evals = 6) ) learner$train(task) - - dir = tempdir() - prefix = "test-1-run" - previous_run = file.path(dir, prefix, "run_1") - if (!file.exists(previous_run)) { - dir.create(previous_run) - } expected_path = file.path(dir, "test-1-run_2") if (file.exists(expected_path)) { From 05f6587bf3e3b70d56695c3690f2d606a20989a1 Mon Sep 17 00:00:00 2001 From: b-zhou Date: Tue, 13 Aug 2024 15:41:43 +0200 Subject: [PATCH 26/34] build: update --- DESCRIPTION | 2 +- NAMESPACE | 1 + man/save_deepcave_run.Rd | 29 +++++++++++++++++++++++++++++ 3 files changed, 31 insertions(+), 1 deletion(-) create mode 100644 man/save_deepcave_run.Rd diff --git a/DESCRIPTION b/DESCRIPTION index ae3d58a..b3e3839 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -62,6 +62,6 @@ Collate: 'LearnerClassifAutoSVM.R' 
'LearnerClassifAutoXgboost.R'
     'LearnerRegrAuto.R'
-    'save_deepcave_run.R'
     'helper.R'
+    'save_deepcave_run.R'
     'zzz.R'

diff --git a/NAMESPACE b/NAMESPACE
index 6fc0f11..14bb580 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -4,6 +4,7 @@ export(LearnerClassifAuto)
 export(LearnerClassifAutoSVM)
 export(LearnerClassifAutoXgboost)
 export(LearnerRegrAuto)
+export(save_deepcave_run)
 import(R6)
 import(checkmate)
 import(data.table)

diff --git a/man/save_deepcave_run.Rd b/man/save_deepcave_run.Rd
new file mode 100644
index 0000000..ab37cc8
--- /dev/null
+++ b/man/save_deepcave_run.Rd
@@ -0,0 +1,29 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/save_deepcave_run.R
+\name{save_deepcave_run}
+\alias{save_deepcave_run}
+\title{Save Tuning History as a DeepCAVE Run}
+\usage{
+save_deepcave_run(
+  instance,
+  path = "logs/mlr3automl",
+  prefix = "run",
+  overwrite = FALSE
+)
+}
+\arguments{
+\item{instance}{(\link{TuningInstanceAsyncSingleCrit})
+Tuning instance to save.}
+
+\item{path}{(\code{character(1)})
+Path to save the run. Defaults to \verb{"logs/mlr3automl"}.}
+
+\item{prefix}{(\code{character(1)})
+Prefix for the name of a new subfolder under \code{path} for storing the current run.}
+
+\item{overwrite}{(\code{logical(1)})
+If \code{FALSE} (default), creates a new subfolder to save the current run. If \code{TRUE}, all existing runs will be deleted.}
+}
+\description{
+Exports information stored in a \code{TuningInstance} in a format recognized by \href{https://automl.github.io/DeepCAVE/main/index.html}{DeepCAVE} as a run. Each run is stored as a folder containing five files \code{configs.json}, \code{configspace.json}, \code{history.jsonl}, \code{meta.json}, and \code{origins.json}.
+}

From 747b7050052656f00f5d04ad6bf0ea5c0d4d63d4 Mon Sep 17 00:00:00 2001
From: b-zhou
Date: Tue, 13 Aug 2024 16:26:44 +0200
Subject: [PATCH 27/34] fix: save path

---
 R/save_deepcave_run.R                   | 4 ++--
 tests/testthat/test_save_deepcave_run.R | 9 +++------
 2 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/R/save_deepcave_run.R b/R/save_deepcave_run.R
index ee06c4c..f62dc76 100644
--- a/R/save_deepcave_run.R
+++ b/R/save_deepcave_run.R
@@ -60,7 +60,7 @@ save_deepcave_run = function(instance, path = "logs/mlr3automl", prefix = "run",
   # `configspace.json`
   jsonlite::write_json(
     get_configspace(instance),
-    paste0(run_path, "/configspace.json"),
+    file.path(run_path, "configspace.json"),
     auto_unbox = TRUE, pretty = TRUE, null = "null"
   )
 
@@ -94,7 +94,7 @@ save_deepcave_run = function(instance, path = "logs/mlr3automl", prefix = "run",
   names(origins) = seq(nrow(instance$archive$data)) - 1
   jsonlite::write_json(
     origins,
-    paste0(run_path, "/origins.json"),
+    file.path(run_path, "origins.json"),
     pretty = TRUE, null = "null"
   )
 }

diff --git a/tests/testthat/test_save_deepcave_run.R b/tests/testthat/test_save_deepcave_run.R
index 0460c2d..4a3f85b 100644
--- a/tests/testthat/test_save_deepcave_run.R
+++ b/tests/testthat/test_save_deepcave_run.R
@@ -1,11 +1,8 @@
 test_that("run is saved without overwriting", {
   dir = tempdir()
-  prefix = "test-1-run"
-  dir.create(file.path(dir, prefix))
-  previous_run = file.path(dir, prefix, "run_1")
-  if (!file.exists(previous_run)) {
-    dir.create(previous_run)
-  }
+
+  previous_run = file.path(dir, "test-1-run_1")
+  dir.create(previous_run, showWarnings = FALSE)
 
   rush_plan(n_workers = 2)
   skip_if_not_installed("e1071")

From 9e0739cb712a29e4146bb92eef878f85e97e8b38 Mon Sep 17 00:00:00 2001
From: b-zhou
Date: Fri, 16 Aug 2024 13:05:28 +0200
Subject: [PATCH 28/34]
fix: typo --- R/save_deepcave_run.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/save_deepcave_run.R b/R/save_deepcave_run.R index f62dc76..8b3b6c1 100644 --- a/R/save_deepcave_run.R +++ b/R/save_deepcave_run.R @@ -140,7 +140,7 @@ get_configspace = function(instance) { choices = choices, # FIXME: `default` is wrong default = choices[[1]], - probabilisties = NULL + probabilities = NULL )) } From ea05c84b12af2d622fdf5add97fd515ab7b4ba9f Mon Sep 17 00:00:00 2001 From: b-zhou Date: Fri, 16 Aug 2024 14:29:51 +0200 Subject: [PATCH 29/34] fix: default --- R/save_deepcave_run.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/save_deepcave_run.R b/R/save_deepcave_run.R index 8b3b6c1..9583987 100644 --- a/R/save_deepcave_run.R +++ b/R/save_deepcave_run.R @@ -149,7 +149,7 @@ get_configspace = function(instance) { lower = row[["lower"]] upper = row[["upper"]] # FIXME: default is wrong - default = mean(lower, upper) + default = lower if (is_logscale) { lower = exp(lower) upper = exp(upper) From f44b4c990a852fb3a59a1f73e7d211261b84729b Mon Sep 17 00:00:00 2001 From: b-zhou Date: Sun, 18 Aug 2024 22:46:18 +0200 Subject: [PATCH 30/34] fix: remove time objective --- R/save_deepcave_run.R | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/R/save_deepcave_run.R b/R/save_deepcave_run.R index 9583987..7e396b0 100644 --- a/R/save_deepcave_run.R +++ b/R/save_deepcave_run.R @@ -83,7 +83,7 @@ save_deepcave_run = function(instance, path = "logs/mlr3automl", prefix = "run", jsonlite::stream_out( get_history(instance), con, - auto_unbox = TRUE, pretty = TRUE, null = "list", na = "null", + auto_unbox = FALSE, pretty = TRUE, null = "list", na = "null", dataframe = "values", verbose = FALSE ) @@ -198,7 +198,7 @@ get_configspace = function(instance) { # Prepare the data.table for converting to `history.jsonl` get_history = function(instance) { - costs = c(instance$objective$codomain$data$id, "runtime_learners") + costs = instance$objective$codomain$data$id selected_cols = c(costs, "timestamp_xs", "timestamp_ys", "state") timestamp_xs = timestamp_ys = state = NULL # resolve global variable note in R CDM check @@ -225,7 +225,6 @@ get_history = function(instance) { # Prepare the list for converting to 'meta.json' get_meta = function(instance){ - # time is handled separately below costs = instance$objective$codomain$data$id objectives_list = map(costs, function(cost) { @@ -256,15 +255,6 @@ get_meta = function(instance){ return(list(name = cost, lower = lower, upper = upper, lock_lower = lock_lower, lock_upper = lock_upper, optimize = optimize)) }) - - objectives_list = c(objectives_list, list(list( - name = "time", - lower = 0, - upper = max(instance$archive$data[, "runtime_learners", with = FALSE]), - lock_lower = TRUE, - lock_upper = FALSE, - optimize = "lower" - ))) return(list( objectives = objectives_list, From 0c3005aaf9d40108a7cd2b84403225641c146fe9 Mon Sep 17 00:00:00 2001 From: b-zhou Date: Sun, 18 Aug 2024 23:13:20 +0200 Subject: [PATCH 31/34] feat: skip branch.selection if only one branch --- R/save_deepcave_run.R | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/R/save_deepcave_run.R b/R/save_deepcave_run.R index 7e396b0..8e8809e 100644 --- a/R/save_deepcave_run.R +++ b/R/save_deepcave_run.R @@ -103,8 +103,14 @@ save_deepcave_run = function(instance, path = "logs/mlr3automl", prefix = "run", # Prepare the list for converting to `configs.json` get_configs = function(instance){ param_ids = 
instance$search_space$data$id
-  logscale_params = param_ids[instance$search_space$is_logscale[param_ids]]
+
+  # skip branch.selection if there is only one level
+  nbranches = instance$search_space$data[id == "branch.selection", "nlevels", with = FALSE]
+  if (nbranches == 1) {
+    param_ids = setdiff(param_ids, "branch.selection")
+  }
+
   config_table = instance$archive$data[, param_ids, with = FALSE]
+  logscale_params = param_ids[instance$search_space$is_logscale[param_ids]]
   config_table[, (logscale_params) := lapply(.SD, exp), .SDcols = logscale_params]
 
   configs_list = map(seq_len(nrow(config_table)), function(i) {
@@ -125,6 +131,9 @@ get_configspace = function(instance) {
     id = NULL # resolve global variable note in R CMD check
     row = instance$search_space$data[id == param_id, ]
 
+    # skip branch.selection if there is only one branch
+    if (param_id == "branch.selection" && row[["nlevels"]] == 1) return()
+
     type = switch(row[["class"]],
       ParamFct = "categorical",
@@ -165,6 +174,8 @@ get_configspace = function(instance) {
       q = NULL
     ))
   })
+  hyperparameters_list = discard(hyperparameters_list, is.null)
+
 
   conditions_list = map(setdiff(param_ids, "branch.selection"), function(param_id) {
     id = NULL # resolve global variable note in R CMD check
@@ -178,6 +189,10 @@ get_configspace = function(instance) {
     }
     child = param_id
     parent = dependency[["on"]]
+
+    # remove dependency on branch.selection if there is only one branch
+    nbranches = instance$search_space$data[id == "branch.selection", "nlevels", with = FALSE]
+    if (parent == "branch.selection" && nbranches == 1) return()
 
     # `cond` below is a list of `Condition`s.
     # Currently, there are only 'CondEqual' and 'CondAnyOf', which should not be used simultaneously.
@@ -203,6 +208,7 @@ get_configspace = function(instance) {
     return(list(child = child, parent = parent, type = "IN", values = cond$rhs))
   })
+  conditions_list = discard(conditions_list, is.null)
 
   return(list(
     hyperparameters = hyperparameters_list,

From 8b2648bbdb179d039780f7c13b961f433b76b660 Mon Sep 17 00:00:00 2001
From: b-zhou
Date: Sun, 18 Aug 2024 23:25:56 +0200
Subject: [PATCH 32/34] refactor: readability

---
 R/save_deepcave_run.R | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/R/save_deepcave_run.R b/R/save_deepcave_run.R
index 8e8809e..3c4d280 100644
--- a/R/save_deepcave_run.R
+++ b/R/save_deepcave_run.R
@@ -35,11 +35,11 @@ save_deepcave_run = function(instance, path = "logs/mlr3automl", prefix = "run",
       if (length(splitted) == 1) next # no run index attached
 
-      idx = suppressWarnings(last(splitted))
-      if (!is.na(idx)) { # idx is successfully coerced to a number
-        idx_int = as.integer(idx)
-        if (idx_int > new_idx) {
-          new_idx = idx_int
-        }
+      idx = suppressWarnings(as.integer(last(splitted)))
+      if (is.na(idx)) next # idx cannot be coerced to a number
+
+      idx_int = as.integer(idx)
+      if (idx_int > new_idx) {
+        new_idx = idx_int
       }
@@ -83,7 +83,9 @@ save_deepcave_run = function(instance, path = "logs/mlr3automl", prefix = "run",
   jsonlite::stream_out(
     get_history(instance),
     con,
-    auto_unbox = FALSE, pretty = TRUE, null = "list", na = "null",
+    # objectives must be a list, so do not auto unbox if a list has only one entry
+    auto_unbox = FALSE,
+    pretty = TRUE, null = "list", na = "null",
     dataframe = "values", verbose = FALSE
   )
 
@@ -103,19 +105,22 @@ save_deepcave_run = function(instance, path = "logs/mlr3automl", prefix = "run",
 
 # Prepare the list for converting to `configs.json`
 get_configs = function(instance){
   param_ids = instance$search_space$data$id
+
   # skip branch.selection if there is only one level
From 8b2648bbdb179d039780f7c13b961f433b76b660 Mon Sep 17 00:00:00 2001
From: b-zhou
Date: Sun, 18 Aug 2024 23:25:56 +0200
Subject: [PATCH 32/34] refactor: readability

---
 R/save_deepcave_run.R | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/R/save_deepcave_run.R b/R/save_deepcave_run.R
index 8e8809e..3c4d280 100644
--- a/R/save_deepcave_run.R
+++ b/R/save_deepcave_run.R
@@ -35,11 +35,11 @@ save_deepcave_run = function(instance, path = "logs/mlr3automl", prefix = "run",
     if (length(splitted) == 1) next # no run index attached
 
     idx = suppressWarnings(last(splitted))
-    if (!is.na(idx)) { # idx is successfully coerced to a number
-      idx_int = as.integer(idx)
-      if (idx_int > new_idx) {
-        new_idx = idx_int
-      }
+    if (is.na(idx)) next # idx cannot be coerced to a number
+
+    idx_int = as.integer(idx)
+    if (idx_int > new_idx) {
+      new_idx = idx_int
     }
   }
 
@@ -83,7 +83,9 @@ save_deepcave_run = function(instance, path = "logs/mlr3automl", prefix = "run",
   jsonlite::stream_out(
     get_history(instance),
     con,
-    auto_unbox = FALSE, pretty = TRUE, null = "list", na = "null",
+    # objectives must stay a JSON array, so do not auto-unbox length-one entries
+    auto_unbox = FALSE,
+    pretty = TRUE, null = "list", na = "null",
     dataframe = "values",
     verbose = FALSE
   )
@@ -103,19 +105,22 @@ save_deepcave_run = function(instance, path = "logs/mlr3automl", prefix = "run",
 
 # Prepare the list for converting to `configs.json`
 get_configs = function(instance){
   param_ids = instance$search_space$data$id
+  # skip branch.selection if there is only one level
+  id = NULL # resolve global variable note in R CMD check
   nbranches = instance$search_space$data[id == "branch.selection", "nlevels", with = FALSE]
   if (nbranches == 1) {
     param_ids = setdiff(param_ids, "branch.selection")
   }
   config_table = instance$archive$data[, param_ids, with = FALSE]
+  # param values in deepcave are on the original scale, not the log scale
   logscale_params = param_ids[instance$search_space$is_logscale[param_ids]]
   config_table[, (logscale_params) := lapply(.SD, exp), .SDcols = logscale_params]
 
   configs_list = map(seq_len(nrow(config_table)), function(i) {
     discard(as.list(config_table[i, ]), is.na)
-  }) 
+  })
 
   names(configs_list) = seq_along(configs_list) - 1
 
   return(configs_list)
@@ -124,7 +129,6 @@ get_configs = function(instance){
 
 # Prepare the list for converting to `configspace.json`
 get_configspace = function(instance) {
-  n_params = nrow(instance$search_space$data)
   param_ids = instance$search_space$data$id
 
   hyperparameters_list = map(param_ids, function(param_id) {
@@ -174,6 +178,7 @@ get_configspace = function(instance) {
       q = NULL
     ))
   })
+  # skipping branch.selection results in null entries => discard them
   hyperparameters_list = discard(hyperparameters_list, is.null)
 
@@ -203,6 +208,7 @@ get_configspace = function(instance) {
     }
     return(list(child = child, parent = parent, type = "IN", values = cond$rhs))
   })
+  # skipping branch.selection results in null entries => discard them
   conditions_list = discard(conditions_list, is.null)
 
   return(list(
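
A note on the `auto_unbox` comment in the hunk above: with `auto_unbox = TRUE`, jsonlite collapses length-one vectors into JSON scalars, so a run with a single objective would be written as a plain string where DeepCAVE expects an array. A quick illustration (the `objectives` key is used here purely for illustration):

    library(jsonlite)

    # a length-one vector collapses to a scalar => wrong shape
    toJSON(list(objectives = "regr.mse"), auto_unbox = TRUE)
    #> {"objectives":"regr.mse"}

    # auto_unbox = FALSE keeps the JSON array
    toJSON(list(objectives = "regr.mse"), auto_unbox = FALSE)
    #> {"objectives":["regr.mse"]}
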
From 8d46679347dbda79eea314eae089b9c39857560b Mon Sep 17 00:00:00 2001
From: b-zhou
Date: Mon, 19 Aug 2024 00:53:48 +0200
Subject: [PATCH 33/34] docs: save run

---
 R/save_deepcave_run.R    | 31 +++++++++++++++++++++++++++++++
 man/save_deepcave_run.Rd | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 63 insertions(+)

diff --git a/R/save_deepcave_run.R b/R/save_deepcave_run.R
index 3c4d280..9739d90 100644
--- a/R/save_deepcave_run.R
+++ b/R/save_deepcave_run.R
@@ -16,6 +16,37 @@
 #' If `FALSE` (default), creates a new subfolder to save the current run. If `TRUE`, all existing runs will be deleted.
 #'
 #' @export
+#' @examples
+#' \dontrun{
+#' rush_plan(n_workers = 2)
+#' task = tsk("penguins")
+#'
+#' learner1 = lrn("classif.auto",
+#'   learner_ids = c("svm", "ranger"),
+#'   small_data_size = 1,
+#'   resampling = rsmp("holdout"),
+#'   measure = msr("classif.ce"),
+#'   terminator = trm("evals", n_evals = 6)
+#' )
+#' learner1$train(task)
+#' # save to `logs/mlr3automl/run_1`
+#' save_deepcave_run(learner1$instance)
+#'
+#' # save to `logs/mlr3automl/run`
+#' # if this folder already exists, it will be overwritten
+#' save_deepcave_run(learner1$instance, overwrite = TRUE)
+#'
+#' learner2 = lrn("classif.auto",
+#'   learner_ids = c("catboost", "xgboost"),
+#'   small_data_size = 1,
+#'   resampling = rsmp("holdout"),
+#'   measure = msr("classif.ce"),
+#'   terminator = trm("evals", n_evals = 6)
+#' )
+#' learner2$train(task)
+#' # save to `logs/mlr3automl/run_2`
+#' save_deepcave_run(learner2$instance)
+#' }
 save_deepcave_run = function(instance, path = "logs/mlr3automl", prefix = "run", overwrite = FALSE) {
   # don't save untuned instance
   if (is.null(instance$result_learner_param_vals)) {
diff --git a/man/save_deepcave_run.Rd b/man/save_deepcave_run.Rd
index ab37cc8..61ca04d 100644
--- a/man/save_deepcave_run.Rd
+++ b/man/save_deepcave_run.Rd
@@ -27,3 +27,35 @@ If \code{FALSE} (default), creates a new subfolder to save the current run. If \
 \description{
 Exports information stored in a \code{TuningInstance} in a format recognized by \href{https://automl.github.io/DeepCAVE/main/index.html}{DeepCAVE} as a run. Each run is stored as a folder containing five files \code{configs.json}, \code{configspace.json}, \code{history.jsonl}, \code{meta.json}, and \code{origins.json}.
 }
+\examples{
+\dontrun{
+rush_plan(n_workers = 2)
+task = tsk("penguins")
+
+learner1 = lrn("classif.auto",
+  learner_ids = c("svm", "ranger"),
+  small_data_size = 1,
+  resampling = rsmp("holdout"),
+  measure = msr("classif.ce"),
+  terminator = trm("evals", n_evals = 6)
+)
+learner1$train(task)
+# save to `logs/mlr3automl/run_1`
+save_deepcave_run(learner1$instance)
+
+# save to `logs/mlr3automl/run`
+# if this folder already exists, it will be overwritten
+save_deepcave_run(learner1$instance, overwrite = TRUE)
+
+learner2 = lrn("classif.auto",
+  learner_ids = c("catboost", "xgboost"),
+  small_data_size = 1,
+  resampling = rsmp("holdout"),
+  measure = msr("classif.ce"),
+  terminator = trm("evals", n_evals = 6)
+)
+learner2$train(task)
+# save to `logs/mlr3automl/run_2`
+save_deepcave_run(learner2$instance)
+}
+}

From cf7279a0ee2957cdd1f652028b6491678293681c Mon Sep 17 00:00:00 2001
From: b-zhou
Date: Mon, 19 Aug 2024 00:54:11 +0200
Subject: [PATCH 34/34] fix: create dir

---
 R/save_deepcave_run.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/save_deepcave_run.R b/R/save_deepcave_run.R
index 9739d90..c71745e 100644
--- a/R/save_deepcave_run.R
+++ b/R/save_deepcave_run.R
@@ -77,13 +77,13 @@ save_deepcave_run = function(instance, path = "logs/mlr3automl", prefix = "run",
 
     new_idx = new_idx + 1
     run_path = file.path(path, paste0(prefix, "_", new_idx))
-    dir.create(run_path)
+    dir.create(run_path, recursive = TRUE)
   } else {
     run_path = file.path(path, prefix)
     if (file.exists(run_path)) {
       lapply(list.files(run_path, full.names = TRUE), file.remove)
     } else {
-      dir.create(run_path)
+      dir.create(run_path, recursive = TRUE)
     }
   }
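
For reference on the `recursive = TRUE` fix: base R's `dir.create()` does not create missing parent directories by default, so saving the first run under a fresh `logs/mlr3automl` tree would fail before this patch. A small sketch of the failure mode, assuming no `logs/` directory exists yet:

    dir.create("logs/mlr3automl/run_1")
    #> Warning: cannot create dir 'logs/mlr3automl/run_1', reason 'No such file or directory'

    # recursive = TRUE creates the parents as needed, like `mkdir -p`
    dir.create("logs/mlr3automl/run_1", recursive = TRUE)
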