From 766bccd674bde31a2dcc5e15d512844abfa80a31 Mon Sep 17 00:00:00 2001
From: b-zhou
Date: Fri, 26 Jul 2024 10:36:52 +0200
Subject: [PATCH 01/34] tests: LearnerRegrAuto

---
 tests/testthat/test_LearnerRegrAuto.R | 296 ++++++++++++++++++++++++++
 1 file changed, 296 insertions(+)
 create mode 100644 tests/testthat/test_LearnerRegrAuto.R

diff --git a/tests/testthat/test_LearnerRegrAuto.R b/tests/testthat/test_LearnerRegrAuto.R
new file mode 100644
index 0000000..2bcf1e6
--- /dev/null
+++ b/tests/testthat/test_LearnerRegrAuto.R
@@ -0,0 +1,296 @@
+test_that("glmnet works (regr)", {
+  rush_plan(n_workers = 2)
+  skip_if_not_installed("glmnet")
+
+  task = tsk("boston_housing")
+  learner = lrn("regr.auto",
+    learner_ids = "glmnet",
+    small_data_size = 1,
+    resampling = rsmp("holdout"),
+    measure = msr("regr.mse"),
+    terminator = trm("evals", n_evals = 6)
+  )
+
+  expect_class(learner$train(task), "LearnerRegrAuto")
+  expect_equal(learner$graph$param_set$values$branch.selection, "glmnet")
+  expect_equal(learner$model$instance$result$branch.selection, "glmnet")
+})
+
+test_that("kknn works (regr)", {
+  rush_plan(n_workers = 2)
+  skip_if_not_installed("kknn")
+
+  task = tsk("boston_housing")
+  learner = lrn("regr.auto",
+    learner_ids = "kknn",
+    small_data_size = 1,
+    resampling = rsmp("holdout"),
+    measure = msr("regr.mse"),
+    terminator = trm("evals", n_evals = 6)
+  )
+
+  expect_class(learner$train(task), "LearnerRegrAuto")
+  expect_equal(learner$graph$param_set$values$branch.selection, "kknn")
+  expect_equal(learner$model$instance$result$branch.selection, "kknn")
+})
+
+test_that("nnet works (regr)", {
+  rush_plan(n_workers = 2)
+  skip_if_not_installed("nnet")
+
+  task = tsk("boston_housing")
+  learner = lrn("regr.auto",
+    learner_ids = "nnet",
+    resampling = rsmp("holdout"),
+    small_data_size = 1,
+    measure = msr("regr.mse"),
+    terminator = trm("evals", n_evals = 6)
+  )
+
+  expect_class(learner$train(task), "LearnerRegrAuto")
+  expect_equal(learner$model$instance$result$branch.selection, "nnet")
+})
+
+test_that("ranger works (regr)", {
+  rush_plan(n_workers = 2)
+  skip_if_not_installed("ranger")
+
+
+  task = tsk("boston_housing")
+  learner = lrn("regr.auto",
+    learner_ids = "ranger",
+    small_data_size = 1,
+    resampling = rsmp("holdout"),
+    measure = msr("regr.mse"),
+    terminator = trm("evals", n_evals = 6)
+  )
+
+  expect_class(learner$train(task), "LearnerRegrAuto")
+  expect_equal(learner$model$instance$result$branch.selection, "ranger")
+})
+
+test_that("svm works (regr)", {
+  rush_plan(n_workers = 2)
+  skip_if_not_installed("e1071")
+
+
+  task = tsk("boston_housing")
+  learner = lrn("regr.auto",
+    learner_ids = "svm",
+    small_data_size = 1,
+    resampling = rsmp("holdout"),
+    measure = msr("regr.mse"),
+    terminator = trm("evals", n_evals = 6)
+  )
+
+  expect_class(learner$train(task), "LearnerRegrAuto")
+  expect_equal(learner$model$instance$result$branch.selection, "svm")
+})
+
+test_that("xgboost works (regr)", {
+  skip_if_not_installed("xgboost")
+  rush_plan(n_workers = 2)
+
+  task = tsk("boston_housing")
+  learner = lrn("regr.auto",
+    learner_ids = "xgboost",
+    small_data_size = 1,
+    # xgboost_eval_metric = "mlogloss",
+    resampling = rsmp("holdout"),
+    measure = msr("regr.mse"),
+    terminator = trm("evals", n_evals = 6)
+  )
+
+  expect_class(learner$train(task), "LearnerRegrAuto")
+  expect_equal(learner$model$instance$result$branch.selection, "xgboost")
+})
+
+test_that("catboost works (regr)", {
+  skip_if_not_installed("catboost")
+  rush_plan(n_workers = 2)
+
+  task = tsk("boston_housing")
+  learner = lrn("regr.auto",
+    learner_ids = "catboost",
+    small_data_size = 1,
+    # catboost_eval_metric = "MultiClass",
+    resampling = rsmp("holdout"),
+    measure = msr("regr.mse"),
+    terminator = trm("evals", n_evals = 6)
+  )
+
+  expect_class(learner$train(task), "LearnerRegrAuto")
+  expect_equal(learner$model$instance$result$branch.selection, "catboost")
+})
+
+test_that("only extra_trees fails", {
+  rush_plan(n_workers = 2)
+
+  task = tsk("boston_housing")
+  expect_error(lrn("regr.auto",
+    learner_ids = "extra_trees",
+    resampling = rsmp("holdout"),
+    measure = msr("regr.mse"),
+    terminator = trm("evals", n_evals = 6)
+  ), "must be combined with other learners")
+})
+
+test_that("extra_trees and glmnet works (regr)", {
+  skip_if_not_installed("glmnet")
+  rush_plan(n_workers = 2)
+
+  task = tsk("boston_housing")
+  learner = lrn("regr.auto",
+    learner_ids = c("extra_trees", "glmnet"),
+    resampling = rsmp("holdout"),
+    measure = msr("regr.mse"),
+    terminator = trm("evals", n_evals = 6)
+  )
+
+  expect_class(learner$train(task), "LearnerRegrAuto")
+  expect_equal(learner$model$instance$result$branch.selection, "extra_trees")
+})
+
+test_that("lightgbm works (regr)", {
+  skip_if_not_installed("lightgbm")
+  rush_plan(n_workers = 2)
+
+  task = tsk("boston_housing")
+  learner = lrn("regr.auto",
+    learner_ids = "lightgbm",
+    # lightgbm_eval_metric = "multi_logloss",
+    resampling = rsmp("holdout"),
+    measure = msr("regr.mse"),
+    terminator = trm("evals", n_evals = 6)
+  )
+
+  expect_class(learner$train(task), "LearnerRegrAuto")
+  expect_equal(learner$model$instance$result$branch.selection, "lightgbm")
+})
+
+test_that("xgboost, catboost and lightgbm work (regr)", {
+  skip_if_not_installed(c("xgboost", "catboost", "lightgbm"))
+  rush_plan(n_workers = 2)
+
+  task = tsk("boston_housing")
+  learner = lrn("regr.auto",
+    learner_ids = c("xgboost", "catboost", "lightgbm"),
+    catboost_eval_metric = "MultiClass",
+    lightgbm_eval_metric = "multi_logloss",
+    xgboost_eval_metric = "mlogloss",
+    resampling = rsmp("holdout"),
+    lhs_size = 1,
+    measure = msr("regr.mse"),
+    terminator = trm("evals", n_evals = 20),
+    callbacks = clbk("mlr3tuning.async_save_logs")
+  )
+
+  expect_class(learner$train(task), "LearnerRegrAuto")
+})
+
+test_that("all learners work (regr)", {
+  skip_if_not_installed(c("glmnet", "kknn", "nnet", "ranger", "e1071", "xgboost", "catboost", "MASS", "lightgbm"))
+  rush_plan(n_workers = 2)
+
+  task = tsk("boston_housing")
+  learner = lrn("regr.auto",
+    small_data_size = 100,
+    measure = msr("regr.mse"),
+    terminator = trm("evals", n_evals = 20),
+    lhs_size = 1
+  )
+
+  expect_class(learner$train(task), "LearnerRegrAuto")
+  expect_class(learner$model$instance, "TuningInstanceAsyncSingleCrit")
+  expect_prediction(learner$predict(task))
+})
+
+# test_that("memory limit works", {
+#   skip_if_not_installed(c("glmnet", "kknn", "nnet", "ranger", "e1071", "xgboost", "catboost", "MASS", "lightgbm"))
+#   rush_plan(n_workers = 2)
+
+#   task = tsk("spam")
+#   learner = lrn("regr.auto",
+#     max_memory = 50,
+#     small_data_size = 100,
+#     measure = msr("regr.mse"),
+#     terminator = trm("evals", n_evals = 20),
+#     resampling = rsmp("holdout"),
+#     lhs_size = 1
+#   )
+
+#   learner$train(task)
+# })
+
+test_that("small data set switch works (regr)", {
+  skip_if_not_installed(c("glmnet", "kknn", "nnet", "ranger", "e1071", "xgboost", "catboost", "lightgbm"))
+  rush_plan(n_workers = 2)
+
+  task = tsk("boston_housing")
+  learner = lrn("regr.auto",
+    small_data_size = 1000,
+    small_data_resampling = rsmp("cv", folds = 2),
+    measure = msr("regr.mse"),
+    terminator = trm("evals", n_evals = 1),
+    lhs_size = 1,
+    store_benchmark_result = TRUE
+  )
+
+  expect_class(learner$train(task), "LearnerRegrAuto")
+  expect_equal(learner$model$instance$archive$benchmark_result$resamplings$resampling[[1]]$iters, 2)
+})
+
+test_that("large data set switch works (regr)", {
+  skip_if_not_installed(c("glmnet", "kknn", "nnet", "ranger", "e1071", "xgboost", "catboost", "lightgbm"))
+  rush_plan(n_workers = 2)
+
+  task = tsk("boston_housing")
+  learner = lrn("regr.auto",
+    large_data_size = 100,
+    large_data_nthread = 4,
+    large_data_learner_ids = "ranger",
+    small_data_size = 100,
+    measure = msr("regr.mse"),
+    terminator = trm("evals", n_evals = 1),
+    lhs_size = 1,
+    store_benchmark_result = TRUE
+  )
+
+  expect_class(learner$train(task), "LearnerRegrAuto")
+  expect_set_equal(learner$model$instance$archive$data$branch.selection, "ranger")
+})
+
+test_that("max_cardinality works (regr)", {
+  skip_if_not_installed(c("glmnet", "kknn", "nnet", "ranger", "e1071", "xgboost", "catboost", "lightgbm"))
+  rush_plan(n_workers = 2)
+
+  task = tsk("boston_housing")
+  learner = lrn("regr.auto",
+    small_data_size = 1,
+    resampling = rsmp("holdout"),
+    max_cardinality = 2,
+    measure = msr("regr.mse"),
+    terminator = trm("evals", n_evals = 1),
+    lhs_size = 1
+  )
+
+  expect_class(learner$train(task), "LearnerRegrAuto")
+})
+
+test_that("max_cardinality works for extra trees (regr)", {
+  skip_if_not_installed(c("glmnet", "kknn", "nnet", "ranger", "e1071", "xgboost", "catboost", "lightgbm"))
+  rush_plan(n_workers = 2)
+
+  task = tsk("boston_housing")
+  learner = lrn("regr.auto",
+    small_data_size = 1,
+    resampling = rsmp("holdout"),
+    max_cardinality = 3,
+    extra_trees_max_cardinality = 2,
+    measure = msr("regr.mse"),
+    terminator = trm("evals", n_evals = 1),
+    lhs_size = 1
+  )
+
+  expect_class(learner$train(task), "LearnerRegrAuto")
+})

From 30de063b99bb5d07458e70f582ccc6f3feca0213 Mon Sep 17 00:00:00 2001
From: b-zhou
Date: Fri, 26 Jul 2024 10:48:41 +0200
Subject: [PATCH 02/34] feat: LearnerRegrAuto

---
 R/LearnerClassifAuto.R | 127 +-----------------------
 R/LearnerRegrAuto.R    | 214 ++++++++++++++++++++++++++++++++++++++++-
 R/helper.R             | 139 ++++++++++++++++++++++++++
 3 files changed, 353 insertions(+), 127 deletions(-)

diff --git a/R/LearnerClassifAuto.R b/R/LearnerClassifAuto.R
index b3b126e..132ab31 100644
--- a/R/LearnerClassifAuto.R
+++ b/R/LearnerClassifAuto.R
@@ -89,7 +89,7 @@ LearnerClassifAuto = R6Class("LearnerClassifAuto",
     .train = function(task) {
       pv = self$param_set$values
       learner_ids = pv$learner_ids
-      self$graph = build_graph(learner_ids)
+      self$graph = build_graph(learner_ids, task_type = "classif")
       self$tuning_space = tuning_space[learner_ids]
 
       lg$debug("Training '%s' on task '%s'", self$id, task$id)
@@ -252,131 +252,6 @@ LearnerClassifAuto = R6Class("LearnerClassifAuto",
 #' @include aaa.R
 learners[["classif.auto"]] = LearnerClassifAuto
 
-build_graph = function(learner_ids) {
-  branches = list()
-  # glmnet
-  if ("glmnet" %in% learner_ids) {
-    branch_glmnet = po("removeconstants", id = "glmnet_removeconstants") %>>%
-      po("imputehist", id = "glmnet_imputehist") %>>%
-      po("imputeoor", id = "glmnet_imputeoor") %>>%
-      po("fixfactors", id = "glmnet_fixfactors") %>>%
-      po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "glmnet_imputesample") %>>%
-      po("collapsefactors", target_level_count = 100, id = "glmnet_collapse") %>>%
-      po("encode", method = "one-hot", id = "glmnet_encode") %>>%
-      po("removeconstants", id = "glmnet_post_removeconstants") %>>%
-      lrn("classif.glmnet", id = "glmnet")
-    branches = c(branches, branch_glmnet)
-  }
-
-  # kknn
-  if ("kknn" %in% learner_ids) {
-    branch_kknn = po("removeconstants", id = "kknn_removeconstants") %>>%
-      po("imputehist", id = "kknn_imputehist") %>>%
-      po("imputeoor", id = "kknn_imputeoor") %>>%
-      po("fixfactors", id = "kknn_fixfactors") %>>%
-      po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "kknn_imputesample") %>>%
-      po("collapsefactors", target_level_count = 100, id = "kknn_collapse") %>>%
-      po("removeconstants", id = "kknn_post_removeconstants") %>>%
-      lrn("classif.kknn", id = "kknn")
-    branches = c(branches, branch_kknn)
-  }
-
-  # lda
-  if ("lda" %in% learner_ids) {
-    branch_lda = po("removeconstants", id = "lda_removeconstants") %>>%
-      po("imputehist", id = "lda_imputehist") %>>%
-      po("imputeoor", id = "lda_imputeoor") %>>%
-      po("fixfactors", id = "lda_fixfactors") %>>%
-      po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "lda_imputesample") %>>%
-      po("collapsefactors", target_level_count = 100, id = "lda_collapse") %>>%
-      po("removeconstants", id = "lda_post_removeconstants") %>>%
-      lrn("classif.lda", id = "lda")
-    branches = c(branches, branch_lda)
-  }
-
-  # nnet
-  if ("nnet" %in% learner_ids) {
-    branch_nnet = po("removeconstants", id = "nnet_removeconstants") %>>%
-      po("imputehist", id = "nnet_imputehist") %>>%
-      po("imputeoor", id = "nnet_imputeoor") %>>%
-      po("fixfactors", id = "nnet_fixfactors") %>>%
-      po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "nnet_imputesample") %>>%
-      po("collapsefactors", target_level_count = 100, id = "nnet_collapse") %>>%
-      po("removeconstants", id = "nnet_post_removeconstants") %>>%
-      lrn("classif.nnet", id = "nnet")
-    branches = c(branches, branch_nnet)
-  }
-
-  # ranger
-  if ("ranger" %in% learner_ids) {
-    branch_ranger = po("removeconstants", id = "ranger_removeconstants") %>>%
-      po("imputeoor", id = "ranger_imputeoor") %>>%
-      po("fixfactors", id = "ranger_fixfactors") %>>%
-      po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "ranger_imputesample") %>>%
-      po("collapsefactors", target_level_count = 100, id = "ranger_collapse") %>>%
-      po("removeconstants", id = "ranger_post_removeconstants") %>>%
-      # use upper bound of search space for memory estimation
-      lrn("classif.ranger", id = "ranger", num.trees = 2000)
-    branches = c(branches, branch_ranger)
-  }
-
-  # svm
-  if ("svm" %in% learner_ids) {
-    branch_svm = po("removeconstants", id = "svm_removeconstants") %>>%
-      po("imputehist", id = "svm_imputehist") %>>%
-      po("imputeoor", id = "svm_imputeoor") %>>%
-      po("fixfactors", id = "svm_fixfactors") %>>%
-      po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "svm_imputesample") %>>%
-      po("collapsefactors", target_level_count = 100, id = "svm_collapse") %>>%
-      po("encode", method = "one-hot", id = "svm_encode") %>>%
-      po("removeconstants", id = "svm_post_removeconstants") %>>%
-      lrn("classif.svm", id = "svm", type = "C-classification")
-    branches = c(branches, branch_svm)
-  }
-
-  # xgboost
-  if ("xgboost" %in% learner_ids) {
-    branch_xgboost = po("removeconstants", id = "xgboost_removeconstants") %>>%
-      po("imputeoor", id = "xgboost_imputeoor") %>>%
-      po("fixfactors", id = "xgboost_fixfactors") %>>%
-      po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "xgboost_imputesample") %>>%
-      po("encodeimpact", id = "xgboost_encode") %>>%
po("removeconstants", id = "xgboost_post_removeconstants") %>>% - lrn("classif.xgboost", id = "xgboost", nrounds = 5000, early_stopping_rounds = 10) - branches = c(branches, branch_xgboost) - } - - # catboost - if ("catboost" %in% learner_ids) { - branch_catboost = po("colapply", id = "catboost_colapply", applicator = as.numeric, affect_columns = selector_type("integer")) %>>% - lrn("classif.catboost", id = "catboost", iterations = 500, early_stopping_rounds = 10, use_best_model = TRUE) - branches = c(branches, branch_catboost) - } - - # extra trees - if ("extra_trees" %in% learner_ids) { - branch_extra_trees = po("removeconstants", id = "extra_trees_removeconstants") %>>% - po("imputeoor", id = "extra_trees_imputeoor") %>>% - po("fixfactors", id = "extra_trees_fixfactors") %>>% - po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "extra_trees_imputesample") %>>% - po("collapsefactors", target_level_count = 40, id = "extra_trees_collapse") %>>% - po("removeconstants", id = "extra_trees_post_removeconstants") %>>% - lrn("classif.ranger", id = "extra_trees", splitrule = "extratrees", num.trees = 100, replace = FALSE, sample.fraction = 1) - branches = c(branches, branch_extra_trees) - } - - # lightgbm - if ("lightgbm" %in% learner_ids) { - branch_lightgbm = lrn("classif.lightgbm", id = "lightgbm", num_iterations = 5000, early_stopping_rounds = 10) - branches = c(branches, branch_lightgbm) - } - - # branch graph - po("branch", options = learner_ids) %>>% - gunion(branches) %>>% - po("unbranch", options = learner_ids) -} - tuning_space = list( glmnet = list( glmnet.s = to_tune(1e-4, 1e4, logscale = TRUE), diff --git a/R/LearnerRegrAuto.R b/R/LearnerRegrAuto.R index bee6e86..9db316b 100644 --- a/R/LearnerRegrAuto.R +++ b/R/LearnerRegrAuto.R @@ -23,16 +23,228 @@ LearnerRegrAuto = R6Class("LearnerRegrAuto", #' @description #' Creates a new instance of this [R6][R6::R6Class] class. 
initialize = function(id = "classif.auto") { + param_set = ps( + # learner + learner_ids = p_uty(default = c("glmnet", "kknn", "lda", "nnet", "ranger", "svm", "xgboost", "catboost", "extra_trees", "lightgbm"), + custom_check = function(x) { + if (all(x %in% c("lda", "extra_trees"))) { + return("Learner 'lda' and 'extra_trees' must be combined with other learners") + } + check_subset(x, c("glmnet", "kknn", "lda", "nnet", "ranger", "svm", "xgboost", "catboost", "extra_trees", "lightgbm")) + }), + learner_timeout = p_int(lower = 1L, default = 900L), + xgboost_eval_metric = p_uty(), + catboost_eval_metric = p_uty(), + lightgbm_eval_metric = p_uty(), + # system + max_nthread = p_int(lower = 1L, default = 1L), + max_memory = p_int(lower = 1L, default = 32000L), + # large data + large_data_size = p_int(lower = 1L, default = 1e6), + large_data_learner_ids = p_uty(), + large_data_nthread = p_int(lower = 1L, default = 4L), + # small data + small_data_size = p_int(lower = 1L, default = 5000L), + small_data_resampling = p_uty(), + max_cardinality = p_int(lower = 1L, default = 100L), + extra_trees_max_cardinality = p_int(lower = 1L, default = 40L), + # tuner + resampling = p_uty(), + terminator = p_uty(), + measure = p_uty(), + lhs_size = p_int(lower = 1L, default = 4L), + callbacks = p_uty(), + store_benchmark_result = p_lgl(default = FALSE)) + param_set$set_values( + learner_ids = c("glmnet", "kknn", "nnet", "ranger", "svm", "xgboost", "catboost", "extra_trees", "lightgbm"), + learner_timeout = 900L, + max_nthread = 1L, + max_memory = 32000L, + large_data_size = 1e6L, + large_data_learner_ids = c("ranger", "xgboost", "catboost", "extra_trees", "lightgbm"), + large_data_nthread = 4L, + small_data_size = 5000L, + small_data_resampling = rsmp("cv", folds = 10L), + max_cardinality = 100L, + extra_trees_max_cardinality = 40L, + resampling = rsmp("cv", folds = 3L), + terminator = trm("run_time", secs = 14400L), + measure = msr("regr.mse"), + lhs_size = 4L, + store_benchmark_result = FALSE) + + super$initialize( + id = id, + task_type = "regr", + param_set = param_set, + packages = c("mlr3tuning", "mlr3learners", "mlr3pipelines", "mlr3mbo", "mlr3automl", "xgboost", "catboost", "lightgbm", "ranger", "nnet", "kknn", "glmnet", "e1071"), + feature_types = c("logical", "integer", "numeric", "character", "factor"), + predict_types = "response", + properties = c("missings", "weights") + ) } ), private = list( .train = function(task) { + pv = self$param_set$values + learner_ids = pv$learner_ids + self$graph = build_graph(learner_ids, task_type = "regr") + self$tuning_space = tuning_space[learner_ids] + + lg$debug("Training '%s' on task '%s'", self$id, task$id) + + # initialize mbo tuner + tuner = tnr("adbo") + + # remove learner based on memory limit + lg$debug("Starting to select from %i learners: %s", length(learner_ids), paste0(learner_ids, collapse = ",")) + + if (!is.null(pv$max_memory)) { + memory_usage = map_dbl(learner_ids, function(learner_id) { + self$graph$pipeops[[learner_id]]$learner$estimate_memory_usage(task) / 1e6 + }) + learner_ids = learner_ids[memory_usage < pv$max_memory] + lg$debug("Checking learners for memory limit of %i MB. 
Keeping %i learner(s): %s", pv$max_memory, length(learner_ids), paste0(learner_ids, collapse = ",")) + } + + # set number of threads + if (!is.null(pv$max_nthread)) { + lg$debug("Setting number of threads per learner to %i", pv$max_nthread) + walk(learner_ids, function(learner_id) { + set_threads(self$graph$pipeops[[learner_id]]$learner, pv$max_nthread) + }) + } + + # reduce number of workers on large data sets + if (!is.null(pv$large_data_size) && task$nrow * task$ncol > pv$large_data_size) { + lg$debug("Task size larger than %i rows", pv$large_data_size) + + learner_ids = intersect(learner_ids, pv$large_data_learner_ids) + self$tuning_space = tuning_space[learner_ids] + lg$debug("Keeping %i learner(s): %s", length(learner_ids), paste0(learner_ids, collapse = ",")) + + lg$debug("Increasing number of threads per learner to %i", pv$large_data_nthread) + walk(learner_ids, function(learner_id) { + set_threads(self$graph$pipeops[[learner_id]]$learner, pv$large_data_nthread) + }) + n_workers = rush_config()$n_workers + n = max(1, floor(n_workers / pv$large_data_nthread)) + tuner$param_set$set_values(n_workers = n) + lg$debug("Reducing number of workers to %i", n) + } + + # small data resampling + resampling = if (!is.null(pv$small_data_size) && task$nrow < pv$small_data_size) { + lg$debug("Task has less than %i rows", pv$small_data_size) + lg$debug("Using small data set resampling with %i iterations", pv$small_data_resampling$iters) + pv$small_data_resampling + } else { + pv$resampling + } + + # cardinality + cardinality = map_int(task$col_info$levels, length) + if (!is.null(pv$max_cardinality) && any(cardinality > pv$max_cardinality)) { + lg$debug("Reducing number of factor levels to %i", pv$max_cardinality) + # collapse factors + pipeop_ids = names(self$graph$pipeops) + pipeop_ids = pipeop_ids[grep("collapse", pipeop_ids)] + walk(pipeop_ids, function(pipeop_id) { + self$graph$pipeops[[pipeop_id]]$param_set$values$target_level_count = pv$max_cardinality + }) + } + + if ("extra_trees" %in% learner_ids && any(cardinality > pv$extra_trees_max_cardinality)) { + lg$debug("Reducing number of factor levels to %i for extra trees", pv$extra_trees_max_cardinality) + self$graph$pipeops$extra_trees_collapse$param_set$values$target_level_count = pv$extra_trees_max_cardinality + } + + # initialize graph learner + graph_learner = as_learner(self$graph) + graph_learner$id = "graph_learner" + graph_learner$predict_type = pv$measure$predict_type + graph_learner$fallback = lrn("regr.featureless", predict_type = pv$measure$predict_type) + graph_learner$encapsulate = c(train = "callr", predict = "callr") + graph_learner$timeout = c(train = pv$learner_timeout, predict = pv$learner_timeout) + + learners_with_validation = intersect(learner_ids, c("xgboost", "catboost", "lightgbm")) + if (length(learners_with_validation)) { + set_validate(graph_learner, "test", ids = learners_with_validation) + } + + # set early stopping + if ("xgboost" %in% learner_ids) { + graph_learner$param_set$values$xgboost.callbacks = list(cb_timeout_xgboost(pv$learner_timeout * 0.8)) + graph_learner$param_set$values$xgboost.eval_metric = pv$xgboost_eval_metric + } + if ("catboost" %in% learner_ids) { + graph_learner$param_set$values$catboost.eval_metric = pv$catboost_eval_metric + } + if ("lightgbm" %in% learner_ids) { + graph_learner$param_set$values$lightgbm.callbacks = list(cb_timeout_lightgbm(pv$learner_timeout * 0.8)) + graph_learner$param_set$values$lightgbm.eval = pv$lightgbm_eval_metric + } + + # initialize search space + tuning_space 
= unlist(unname(self$tuning_space), recursive = FALSE) + graph_scratch = graph_learner$clone(deep = TRUE) + graph_scratch$param_set$set_values(.values = tuning_space) + graph_scratch$param_set$set_values(branch.selection = to_tune(learner_ids)) + search_space = graph_scratch$param_set$search_space() + walk(learner_ids, function(learner_id) { + param_ids = search_space$ids() + param_ids = grep(paste0("^", learner_id), param_ids, value = TRUE) + walk(param_ids, function(param_id) { + # skip internal tuning parameter + if (param_id %in% c("xgboost.nrounds", "catboost.iterations", "lightgbm.num_iterations")) return() + search_space$add_dep( + id = param_id, + on = "branch.selection", + cond = CondEqual$new(learner_id) + ) + }) + }) + + # initial design + lhs_xdt = generate_lhs_design(pv$lhs_size, self$task_type, setdiff(learner_ids, c("lda", "extra_trees")), self$tuning_space) + default_xdt = generate_default_design(self$task_type, learner_ids, task, self$tuning_space) + initial_xdt = rbindlist(list(lhs_xdt, default_xdt), use.names = TRUE, fill = TRUE) + setorderv(initial_xdt, "branch.selection") + tuner$param_set$set_values(initial_design = initial_xdt) + + # initialize auto tuner + self$instance = ti_async( + task = task, + learner = graph_learner, + resampling = resampling, + measures = pv$measure, + terminator = pv$terminator, + search_space = search_space, + callbacks = pv$callbacks, + store_benchmark_result = pv$store_benchmark_result + ) + + # tune + lg$debug("Learner '%s' starts tuning phase", self$id) + tuner$optimize(self$instance) + + # fit final model + lg$debug("Learner '%s' fits final model", self$id) + if (length(learners_with_validation)) { + set_validate(graph_learner, NULL, ids = intersect(learner_ids, c("xgboost", "catboost", "lightgbm"))) + } + graph_learner$param_set$set_values(.values = self$instance$result_learner_param_vals, .insert = FALSE) + graph_learner$timeout = c(train = Inf, predict = Inf) + graph_learner$train(task) + + list(graph_learner = graph_learner, instance = self$instance) }, .predict = function(task) { - + lg$debug("Predicting with '%s' on task '%s'", self$id, task$id) + self$model$graph_learner$predict(task) } ) ) diff --git a/R/helper.R b/R/helper.R index 4b27c1c..8d4ff40 100644 --- a/R/helper.R +++ b/R/helper.R @@ -1,3 +1,142 @@ +build_graph = function(learner_ids, task_type) { + assert_choice(task_type, c("classif", "regr")) + learners_reg = c("glmnet", "kknn", "nnet", "ranger", "svm", "xgboost", "catboost", "extra_trees", "lightgbm") + if (task_type == "regr") { + assert_subset(learner_ids, learners_reg) + } else { + assert_subset(learner_ids, c(learners_reg, "lda")) + } + + branches = list() + # glmnet + if ("glmnet" %in% learner_ids) { + branch_glmnet = po("removeconstants", id = "glmnet_removeconstants") %>>% + po("imputehist", id = "glmnet_imputehist") %>>% + po("imputeoor", id = "glmnet_imputeoor") %>>% + po("fixfactors", id = "glmnet_fixfactors") %>>% + po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "glmnet_imputesample") %>>% + po("collapsefactors", target_level_count = 100, id = "glmnet_collapse") %>>% + po("encode", method = "one-hot", id = "glmnet_encode") %>>% + po("removeconstants", id = "glmnet_post_removeconstants") %>>% + lrn(paste0(task_type, ".glmnet"), id = "glmnet") + branches = c(branches, branch_glmnet) + } + + # kknn + if ("kknn" %in% learner_ids) { + branch_kknn = po("removeconstants", id = "kknn_removeconstants") %>>% + po("imputehist", id = "kknn_imputehist") %>>% + po("imputeoor", id = 
"kknn_imputeoor") %>>% + po("fixfactors", id = "kknn_fixfactors") %>>% + po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "kknn_imputesample") %>>% + po("collapsefactors", target_level_count = 100, id = "kknn_collapse") %>>% + po("removeconstants", id = "kknn_post_removeconstants") %>>% + lrn(paste0(task_type, ".kknn"), id = "kknn") + branches = c(branches, branch_kknn) + } + + # lda + # only for classification + if ("lda" %in% learner_ids) { + branch_lda = po("removeconstants", id = "lda_removeconstants") %>>% + po("imputehist", id = "lda_imputehist") %>>% + po("imputeoor", id = "lda_imputeoor") %>>% + po("fixfactors", id = "lda_fixfactors") %>>% + po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "lda_imputesample") %>>% + po("collapsefactors", target_level_count = 100, id = "lda_collapse") %>>% + po("removeconstants", id = "lda_post_removeconstants") %>>% + lrn("classif.lda", id = "lda") + branches = c(branches, branch_lda) + } + + # nnet + if ("nnet" %in% learner_ids) { + branch_nnet = po("removeconstants", id = "nnet_removeconstants") %>>% + po("imputehist", id = "nnet_imputehist") %>>% + po("imputeoor", id = "nnet_imputeoor") %>>% + po("fixfactors", id = "nnet_fixfactors") %>>% + po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "nnet_imputesample") %>>% + po("collapsefactors", target_level_count = 100, id = "nnet_collapse") %>>% + po("removeconstants", id = "nnet_post_removeconstants") %>>% + lrn(paste0(task_type, ".nnet"), id = "nnet") + branches = c(branches, branch_nnet) + } + + # ranger + if ("ranger" %in% learner_ids) { + branch_ranger = po("removeconstants", id = "ranger_removeconstants") %>>% + po("imputeoor", id = "ranger_imputeoor") %>>% + po("fixfactors", id = "ranger_fixfactors") %>>% + po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "ranger_imputesample") %>>% + po("collapsefactors", target_level_count = 100, id = "ranger_collapse") %>>% + po("removeconstants", id = "ranger_post_removeconstants") %>>% + # use upper bound of search space for memory estimation + lrn(paste0(task_type, ".ranger"), id = "ranger", num.trees = 2000) + branches = c(branches, branch_ranger) + } + + # svm + if ("svm" %in% learner_ids) { + svm_type = if (task_type == "classif") { + "C-classification" + } else { + "eps-regression" + } + branch_svm = po("removeconstants", id = "svm_removeconstants") %>>% + po("imputehist", id = "svm_imputehist") %>>% + po("imputeoor", id = "svm_imputeoor") %>>% + po("fixfactors", id = "svm_fixfactors") %>>% + po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "svm_imputesample") %>>% + po("collapsefactors", target_level_count = 100, id = "svm_collapse") %>>% + po("encode", method = "one-hot", id = "svm_encode") %>>% + po("removeconstants", id = "svm_post_removeconstants") %>>% + lrn(paste0(task_type, ".svm"), id = "svm", type = svm_type) + branches = c(branches, branch_svm) + } + + # xgboost + if ("xgboost" %in% learner_ids) { + branch_xgboost = po("removeconstants", id = "xgboost_removeconstants") %>>% + po("imputeoor", id = "xgboost_imputeoor") %>>% + po("fixfactors", id = "xgboost_fixfactors") %>>% + po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "xgboost_imputesample") %>>% + po("encodeimpact", id = "xgboost_encode") %>>% + po("removeconstants", id = "xgboost_post_removeconstants") %>>% + lrn(paste0(task_type, ".xgboost"), id = "xgboost", nrounds = 5000, early_stopping_rounds = 10) 
+    branches = c(branches, branch_xgboost)
+  }
+
+  # catboost
+  if ("catboost" %in% learner_ids) {
+    branch_catboost = po("colapply", id = "catboost_colapply", applicator = as.numeric, affect_columns = selector_type("integer")) %>>%
+      lrn(paste0(task_type, ".catboost"), id = "catboost", iterations = 500, early_stopping_rounds = 10, use_best_model = TRUE)
+    branches = c(branches, branch_catboost)
+  }
+
+  # extra trees
+  if ("extra_trees" %in% learner_ids) {
+    branch_extra_trees = po("removeconstants", id = "extra_trees_removeconstants") %>>%
+      po("imputeoor", id = "extra_trees_imputeoor") %>>%
+      po("fixfactors", id = "extra_trees_fixfactors") %>>%
+      po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "extra_trees_imputesample") %>>%
+      po("collapsefactors", target_level_count = 40, id = "extra_trees_collapse") %>>%
+      po("removeconstants", id = "extra_trees_post_removeconstants") %>>%
+      lrn(paste0(task_type, ".ranger"), id = "extra_trees", splitrule = "extratrees", num.trees = 100, replace = FALSE, sample.fraction = 1)
+    branches = c(branches, branch_extra_trees)
+  }
+
+  # lightgbm
+  if ("lightgbm" %in% learner_ids) {
+    branch_lightgbm = lrn(paste0(task_type, ".lightgbm"), id = "lightgbm", num_iterations = 5000, early_stopping_rounds = 10)
+    branches = c(branches, branch_lightgbm)
+  }
+
+  # branch graph
+  po("branch", options = learner_ids) %>>%
+    gunion(branches) %>>%
+    po("unbranch", options = learner_ids)
+}
+
 generate_default_design = function(task_type, learner_ids, task, tuning_space, branch = TRUE) {
   map_dtr(learner_ids, function(learner_id) {
     if (paste0(task_type, ".", learner_id) %nin% mlr_learners$keys()) {

From 2c99b98d2051ce4fbb27257a772f644a8d0ffa70 Mon Sep 17 00:00:00 2001
From: b-zhou
Date: Fri, 26 Jul 2024 11:02:45 +0200
Subject: [PATCH 03/34] test: extra trees and eval metrics

---
 tests/testthat/test_LearnerRegrAuto.R | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/testthat/test_LearnerRegrAuto.R b/tests/testthat/test_LearnerRegrAuto.R
index 2bcf1e6..4b274cc 100644
--- a/tests/testthat/test_LearnerRegrAuto.R
+++ b/tests/testthat/test_LearnerRegrAuto.R
@@ -148,7 +148,7 @@ test_that("extra_trees and glmnet works (regr)", {
   )
 
   expect_class(learner$train(task), "LearnerRegrAuto")
-  expect_equal(learner$model$instance$result$branch.selection, "extra_trees")
+  expect_choice(learner$model$instance$result$branch.selection, c("extra_trees", "glmnet"))
 })
 
 test_that("lightgbm works (regr)", {
@@ -175,9 +175,9 @@ test_that("xgboost, catboost and lightgbm work (regr)", {
   task = tsk("boston_housing")
   learner = lrn("regr.auto",
     learner_ids = c("xgboost", "catboost", "lightgbm"),
-    catboost_eval_metric = "MultiClass",
-    lightgbm_eval_metric = "multi_logloss",
-    xgboost_eval_metric = "mlogloss",
+    # catboost_eval_metric = "MultiClass",
+    # lightgbm_eval_metric = "multi_logloss",
+    # xgboost_eval_metric = "mlogloss",
     resampling = rsmp("holdout"),
     lhs_size = 1,
     measure = msr("regr.mse"),

From 4bb4d0c716da7398985283d61fb209c6fa07c057 Mon Sep 17 00:00:00 2001
From: b-zhou
Date: Fri, 26 Jul 2024 11:14:38 +0200
Subject: [PATCH 04/34] fix: remove lda

---
 R/LearnerRegrAuto.R | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/R/LearnerRegrAuto.R b/R/LearnerRegrAuto.R
index 9db316b..20ccf50 100644
--- a/R/LearnerRegrAuto.R
+++ b/R/LearnerRegrAuto.R
@@ -25,12 +25,12 @@ LearnerRegrAuto = R6Class("LearnerRegrAuto",
     initialize = function(id = "regr.auto") {
       param_set = ps(
         # learner
-        learner_ids = p_uty(default = c("glmnet", "kknn", "lda", "nnet", "ranger", "svm", "xgboost", "catboost", "extra_trees", "lightgbm"),
+        learner_ids = p_uty(default = c("glmnet", "kknn", "nnet", "ranger", "svm", "xgboost", "catboost", "extra_trees", "lightgbm"),
           custom_check = function(x) {
-            if (all(x %in% c("lda", "extra_trees"))) {
-              return("Learner 'lda' and 'extra_trees' must be combined with other learners")
+            if (length(x) == 1 && x == "extra_trees") {
+              return("Learner 'extra_trees' must be combined with other learners")
             }
-            check_subset(x, c("glmnet", "kknn", "lda", "nnet", "ranger", "svm", "xgboost", "catboost", "extra_trees", "lightgbm"))
+            check_subset(x, c("glmnet", "kknn", "nnet", "ranger", "svm", "xgboost", "catboost", "extra_trees", "lightgbm"))
           }),
         learner_timeout = p_int(lower = 1L, default = 900L),
         xgboost_eval_metric = p_uty(),
@@ -208,7 +208,7 @@ LearnerRegrAuto = R6Class("LearnerRegrAuto",
       })
 
       # initial design
-      lhs_xdt = generate_lhs_design(pv$lhs_size, self$task_type, setdiff(learner_ids, c("lda", "extra_trees")), self$tuning_space)
+      lhs_xdt = generate_lhs_design(pv$lhs_size, self$task_type, setdiff(learner_ids, "extra_trees"), self$tuning_space)
       default_xdt = generate_default_design(self$task_type, learner_ids, task, self$tuning_space)
       initial_xdt = rbindlist(list(lhs_xdt, default_xdt), use.names = TRUE, fill = TRUE)
       setorderv(initial_xdt, "branch.selection")

From 823bced553f0f2e63211518b4d1660e83995745b Mon Sep 17 00:00:00 2001
From: b-zhou
Date: Fri, 26 Jul 2024 12:15:56 +0200
Subject: [PATCH 05/34] refactor: build_graph

---
 R/build_graph.R | 202 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 202 insertions(+)
 create mode 100644 R/build_graph.R

diff --git a/R/build_graph.R b/R/build_graph.R
new file mode 100644
index 0000000..c863ba6
--- /dev/null
+++ b/R/build_graph.R
@@ -0,0 +1,202 @@
+build_graph = function(learner_ids, task_type) {
+  assert_choice(task_type, c("classif", "regr"))
+  learners_reg = c("glmnet", "kknn", "nnet", "ranger", "svm", "xgboost", "catboost", "extra_trees", "lightgbm")
+  if (task_type == "regr") {
+    assert_subset(learner_ids, learners_reg)
+  } else {
+    assert_subset(learner_ids, c(learners_reg, "lda"))
+  }
+
+  branches = list()
+  # glmnet
+  if ("glmnet" %in% learner_ids) {
+    branch_glmnet = po("removeconstants", id = "glmnet_removeconstants") %>>%
+      po("imputehist", id = "glmnet_imputehist") %>>%
+      po("imputeoor", id = "glmnet_imputeoor") %>>%
+      po("fixfactors", id = "glmnet_fixfactors") %>>%
+      po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "glmnet_imputesample") %>>%
+      po("collapsefactors", target_level_count = 100, id = "glmnet_collapse") %>>%
+      po("encode", method = "one-hot", id = "glmnet_encode") %>>%
+      po("removeconstants", id = "glmnet_post_removeconstants") %>>%
+      lrn(paste0(task_type, ".glmnet"), id = "glmnet")
+    branches = c(branches, branch_glmnet)
+  }
+
+  # kknn
+  if ("kknn" %in% learner_ids) {
+    branch_kknn = po("removeconstants", id = "kknn_removeconstants") %>>%
+      po("imputehist", id = "kknn_imputehist") %>>%
+      po("imputeoor", id = "kknn_imputeoor") %>>%
+      po("fixfactors", id = "kknn_fixfactors") %>>%
+      po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "kknn_imputesample") %>>%
+      po("collapsefactors", target_level_count = 100, id = "kknn_collapse") %>>%
+      po("removeconstants", id = "kknn_post_removeconstants") %>>%
+      lrn(paste0(task_type, ".kknn"), id = "kknn")
+    branches = c(branches, branch_kknn)
+  }
+
+  # lda
+  # only for classification
+  if ("lda" %in% learner_ids) {
branch_lda = po("removeconstants", id = "lda_removeconstants") %>>% + po("imputehist", id = "lda_imputehist") %>>% + po("imputeoor", id = "lda_imputeoor") %>>% + po("fixfactors", id = "lda_fixfactors") %>>% + po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "lda_imputesample") %>>% + po("collapsefactors", target_level_count = 100, id = "lda_collapse") %>>% + po("removeconstants", id = "lda_post_removeconstants") %>>% + lrn("classif.lda", id = "lda") + branches = c(branches, branch_lda) + } + + # nnet + if ("nnet" %in% learner_ids) { + branch_nnet = po("removeconstants", id = "nnet_removeconstants") %>>% + po("imputehist", id = "nnet_imputehist") %>>% + po("imputeoor", id = "nnet_imputeoor") %>>% + po("fixfactors", id = "nnet_fixfactors") %>>% + po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "nnet_imputesample") %>>% + po("collapsefactors", target_level_count = 100, id = "nnet_collapse") %>>% + po("removeconstants", id = "nnet_post_removeconstants") %>>% + lrn(paste0(task_type, ".nnet"), id = "nnet") + branches = c(branches, branch_nnet) + } + + # ranger + if ("ranger" %in% learner_ids) { + branch_ranger = po("removeconstants", id = "ranger_removeconstants") %>>% + po("imputeoor", id = "ranger_imputeoor") %>>% + po("fixfactors", id = "ranger_fixfactors") %>>% + po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "ranger_imputesample") %>>% + po("collapsefactors", target_level_count = 100, id = "ranger_collapse") %>>% + po("removeconstants", id = "ranger_post_removeconstants") %>>% + # use upper bound of search space for memory estimation + lrn(paste0(task_type, ".ranger"), id = "ranger", num.trees = 2000) + branches = c(branches, branch_ranger) + } + + # svm + if ("svm" %in% learner_ids) { + svm_type = if (task_type == "classif") { + "C-classification" + } else { + "eps-regression" + } + branch_svm = po("removeconstants", id = "svm_removeconstants") %>>% + po("imputehist", id = "svm_imputehist") %>>% + po("imputeoor", id = "svm_imputeoor") %>>% + po("fixfactors", id = "svm_fixfactors") %>>% + po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "svm_imputesample") %>>% + po("collapsefactors", target_level_count = 100, id = "svm_collapse") %>>% + po("encode", method = "one-hot", id = "svm_encode") %>>% + po("removeconstants", id = "svm_post_removeconstants") %>>% + lrn(paste0(task_type, ".svm"), id = "svm", type = svm_type) + branches = c(branches, branch_svm) + } + + # xgboost + if ("xgboost" %in% learner_ids) { + branch_xgboost = po("removeconstants", id = "xgboost_removeconstants") %>>% + po("imputeoor", id = "xgboost_imputeoor") %>>% + po("fixfactors", id = "xgboost_fixfactors") %>>% + po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "xgboost_imputesample") %>>% + po("encodeimpact", id = "xgboost_encode") %>>% + po("removeconstants", id = "xgboost_post_removeconstants") %>>% + lrn(paste0(task_type, ".xgboost"), id = "xgboost", nrounds = 5000, early_stopping_rounds = 10) + branches = c(branches, branch_xgboost) + } + + # catboost + if ("catboost" %in% learner_ids) { + branch_catboost = po("colapply", id = "catboost_colapply", applicator = as.numeric, affect_columns = selector_type("integer")) %>>% + lrn(paste0(task_type, ".catboost"), id = "catboost", iterations = 500, early_stopping_rounds = 10, use_best_model = TRUE) + branches = c(branches, branch_catboost) + } + + # extra trees + if ("extra_trees" %in% learner_ids) { + branch_extra_trees 
= po("removeconstants", id = "extra_trees_removeconstants") %>>% + po("imputeoor", id = "extra_trees_imputeoor") %>>% + po("fixfactors", id = "extra_trees_fixfactors") %>>% + po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "extra_trees_imputesample") %>>% + po("collapsefactors", target_level_count = 40, id = "extra_trees_collapse") %>>% + po("removeconstants", id = "extra_trees_post_removeconstants") %>>% + lrn(paste0(task_type, ".ranger"), id = "extra_trees", splitrule = "extratrees", num.trees = 100, replace = FALSE, sample.fraction = 1) + branches = c(branches, branch_extra_trees) + } + + # lightgbm + if ("lightgbm" %in% learner_ids) { + branch_lightgbm = lrn(paste0(task_type, ".lightgbm"), id = "lightgbm", num_iterations = 5000, early_stopping_rounds = 10) + branches = c(branches, branch_lightgbm) + } + + # branch graph + po("branch", options = learner_ids) %>>% + gunion(branches) %>>% + po("unbranch", options = learner_ids) +} + +tuning_space = list( + glmnet = list( + glmnet.s = to_tune(1e-4, 1e4, logscale = TRUE), + glmnet.alpha = to_tune(0, 1) + ), + + kknn = list( + kknn.k = to_tune(1, 50, logscale = TRUE), + kknn.distance = to_tune(1, 5), + kknn.kernel = to_tune(c("rectangular", "optimal", "epanechnikov", "biweight", "triweight", "cos", "inv", "gaussian", "rank")) + ), + + lda = list(), + + extra_trees = list(), + + nnet = list( + nnet.maxit = to_tune(1e1, 1e3, logscale = TRUE), + nnet.decay = to_tune(1e-4, 1e-1, logscale = TRUE), + nnet.size = to_tune(2, 50, logscale = TRUE) + ), + + ranger = list( + ranger.mtry.ratio = to_tune(0, 1), + ranger.replace = to_tune(), + ranger.sample.fraction = to_tune(1e-1, 1), + ranger.num.trees = to_tune(500, 2000) + ), + + svm = list( + svm.cost = to_tune(1e-4, 1e4, logscale = TRUE), + svm.kernel = to_tune(c("polynomial", "radial", "sigmoid", "linear")), + svm.degree = to_tune(2, 5), + svm.gamma = to_tune(1e-4, 1e4, logscale = TRUE) + ), + + xgboost = list( + xgboost.eta = to_tune(1e-4, 1, logscale = TRUE), + xgboost.max_depth = to_tune(1, 20), + xgboost.colsample_bytree = to_tune(1e-1, 1), + xgboost.colsample_bylevel = to_tune(1e-1, 1), + xgboost.lambda = to_tune(1e-3, 1e3, logscale = TRUE), + xgboost.alpha = to_tune(1e-3, 1e3, logscale = TRUE), + xgboost.subsample = to_tune(1e-1, 1), + xgboost.nrounds = to_tune(1, 5000, internal = TRUE) + ), + + catboost = list( + catboost.depth = to_tune(5, 8), + catboost.learning_rate = to_tune(5e-3, 0.2, logscale = TRUE), + catboost.l2_leaf_reg = to_tune(1, 5), + catboost.iterations = to_tune(1, 500, internal = TRUE) + ), + + + lightgbm = list( + lightgbm.learning_rate = to_tune(5e-3, 0.2, logscale = TRUE), + lightgbm.feature_fraction = to_tune(0.75, 1), + lightgbm.min_data_in_leaf = to_tune(2, 60), + lightgbm.num_leaves = to_tune(16, 96), + lightgbm.num_iterations = to_tune(1, 5000, internal = TRUE) + ) +) \ No newline at end of file From 29271640423914d4330c4391033aa538b08808ff Mon Sep 17 00:00:00 2001 From: b-zhou Date: Fri, 26 Jul 2024 12:15:44 +0200 Subject: [PATCH 06/34] refactor: train --- DESCRIPTION | 2 + R/LearnerClassifAuto.R | 218 +---------------------------------------- R/LearnerRegrAuto.R | 154 +---------------------------- R/helper.R | 139 -------------------------- R/train_auto.R | 155 +++++++++++++++++++++++++++++ 5 files changed, 159 insertions(+), 509 deletions(-) create mode 100644 R/train_auto.R diff --git a/DESCRIPTION b/DESCRIPTION index 4ab70c0..dfc53bb 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -62,4 +62,6 @@ Collate: 'LearnerClassifAutoXgboost.R' 
     'LearnerRegrAuto.R'
     'helper.R'
+    'build_graph.R'
+    'train_auto.R'
     'zzz.R'

diff --git a/R/LearnerClassifAuto.R b/R/LearnerClassifAuto.R
index 132ab31..1d740b0 100644
--- a/R/LearnerClassifAuto.R
+++ b/R/LearnerClassifAuto.R
@@ -87,159 +87,7 @@ LearnerClassifAuto = R6Class("LearnerClassifAuto",
 
   private = list(
     .train = function(task) {
-      pv = self$param_set$values
-      learner_ids = pv$learner_ids
-      self$graph = build_graph(learner_ids, task_type = "classif")
-      self$tuning_space = tuning_space[learner_ids]
-
-      lg$debug("Training '%s' on task '%s'", self$id, task$id)
-
-      # initialize mbo tuner
-      tuner = tnr("adbo")
-
-      # remove learner based on memory limit
-      lg$debug("Starting to select from %i learners: %s", length(learner_ids), paste0(learner_ids, collapse = ","))
-
-      if (!is.null(pv$max_memory)) {
-        memory_usage = map_dbl(learner_ids, function(learner_id) {
-          self$graph$pipeops[[learner_id]]$learner$estimate_memory_usage(task) / 1e6
-        })
-        learner_ids = learner_ids[memory_usage < pv$max_memory]
-        lg$debug("Checking learners for memory limit of %i MB. Keeping %i learner(s): %s", pv$max_memory, length(learner_ids), paste0(learner_ids, collapse = ","))
-      }
-
-      # set number of threads
-      if (!is.null(pv$max_nthread)) {
-        lg$debug("Setting number of threads per learner to %i", pv$max_nthread)
-        walk(learner_ids, function(learner_id) {
-          set_threads(self$graph$pipeops[[learner_id]]$learner, pv$max_nthread)
-        })
-      }
-
-      # reduce number of workers on large data sets
-      if (!is.null(pv$large_data_size) && task$nrow * task$ncol > pv$large_data_size) {
-        lg$debug("Task size larger than %i rows", pv$large_data_size)
-
-        learner_ids = intersect(learner_ids, pv$large_data_learner_ids)
-        self$tuning_space = tuning_space[learner_ids]
-        lg$debug("Keeping %i learner(s): %s", length(learner_ids), paste0(learner_ids, collapse = ","))
-
-        lg$debug("Increasing number of threads per learner to %i", pv$large_data_nthread)
-        walk(learner_ids, function(learner_id) {
-          set_threads(self$graph$pipeops[[learner_id]]$learner, pv$large_data_nthread)
-        })
-        n_workers = rush_config()$n_workers
-        n = max(1, floor(n_workers / pv$large_data_nthread))
-        tuner$param_set$set_values(n_workers = n)
-        lg$debug("Reducing number of workers to %i", n)
-      }
-
-      # small data resampling
-      resampling = if (!is.null(pv$small_data_size) && task$nrow < pv$small_data_size) {
-        lg$debug("Task has less than %i rows", pv$small_data_size)
-        lg$debug("Using small data set resampling with %i iterations", pv$small_data_resampling$iters)
-        pv$small_data_resampling
-      } else {
-        pv$resampling
-      }
-
-      # cardinality
-      cardinality = map_int(task$col_info$levels, length)
-      if (!is.null(pv$max_cardinality) && any(cardinality > pv$max_cardinality)) {
-        lg$debug("Reducing number of factor levels to %i", pv$max_cardinality)
-
-        # collapse factors
-        pipeop_ids = names(self$graph$pipeops)
-        pipeop_ids = pipeop_ids[grep("collapse", pipeop_ids)]
-        walk(pipeop_ids, function(pipeop_id) {
-          self$graph$pipeops[[pipeop_id]]$param_set$values$target_level_count = pv$max_cardinality
-        })
-      }
-
-      if ("extra_trees" %in% learner_ids && any(cardinality > pv$extra_trees_max_cardinality)) {
-        lg$debug("Reducing number of factor levels to %i for extra trees", pv$extra_trees_max_cardinality)
-        self$graph$pipeops$extra_trees_collapse$param_set$values$target_level_count = pv$extra_trees_max_cardinality
-      }
-
-      # initialize graph learner
-      graph_learner = as_learner(self$graph)
-      graph_learner$id = "graph_learner"
-      graph_learner$predict_type = pv$measure$predict_type
graph_learner$fallback = lrn("classif.featureless", predict_type = pv$measure$predict_type) - graph_learner$encapsulate = c(train = "callr", predict = "callr") - graph_learner$timeout = c(train = pv$learner_timeout, predict = pv$learner_timeout) - - learners_with_validation = intersect(learner_ids, c("xgboost", "catboost", "lightgbm")) - if (length(learners_with_validation)) { - set_validate(graph_learner, "test", ids = learners_with_validation) - } - - # set early stopping - if ("xgboost" %in% learner_ids) { - graph_learner$param_set$values$xgboost.callbacks = list(cb_timeout_xgboost(pv$learner_timeout * 0.8)) - graph_learner$param_set$values$xgboost.eval_metric = pv$xgboost_eval_metric - } - if ("catboost" %in% learner_ids) { - graph_learner$param_set$values$catboost.eval_metric = pv$catboost_eval_metric - } - if ("lightgbm" %in% learner_ids) { - graph_learner$param_set$values$lightgbm.callbacks = list(cb_timeout_lightgbm(pv$learner_timeout * 0.8)) - graph_learner$param_set$values$lightgbm.eval = pv$lightgbm_eval_metric - } - - # initialize search space - tuning_space = unlist(unname(self$tuning_space), recursive = FALSE) - graph_scratch = graph_learner$clone(deep = TRUE) - graph_scratch$param_set$set_values(.values = tuning_space) - graph_scratch$param_set$set_values(branch.selection = to_tune(learner_ids)) - search_space = graph_scratch$param_set$search_space() - walk(learner_ids, function(learner_id) { - param_ids = search_space$ids() - param_ids = grep(paste0("^", learner_id), param_ids, value = TRUE) - walk(param_ids, function(param_id) { - # skip internal tuning parameter - if (param_id %in% c("xgboost.nrounds", "catboost.iterations", "lightgbm.num_iterations")) return() - search_space$add_dep( - id = param_id, - on = "branch.selection", - cond = CondEqual$new(learner_id) - ) - }) - }) - - # initial design - lhs_xdt = generate_lhs_design(pv$lhs_size, self$task_type, setdiff(learner_ids, c("lda", "extra_trees")), self$tuning_space) - default_xdt = generate_default_design(self$task_type, learner_ids, task, self$tuning_space) - initial_xdt = rbindlist(list(lhs_xdt, default_xdt), use.names = TRUE, fill = TRUE) - setorderv(initial_xdt, "branch.selection") - tuner$param_set$set_values(initial_design = initial_xdt) - - # initialize auto tuner - self$instance = ti_async( - task = task, - learner = graph_learner, - resampling = resampling, - measures = pv$measure, - terminator = pv$terminator, - search_space = search_space, - callbacks = pv$callbacks, - store_benchmark_result = pv$store_benchmark_result - ) - - # tune - lg$debug("Learner '%s' starts tuning phase", self$id) - tuner$optimize(self$instance) - - # fit final model - lg$debug("Learner '%s' fits final model", self$id) - if (length(learners_with_validation)) { - set_validate(graph_learner, NULL, ids = intersect(learner_ids, c("xgboost", "catboost", "lightgbm"))) - } - graph_learner$param_set$set_values(.values = self$instance$result_learner_param_vals, .insert = FALSE) - graph_learner$timeout = c(train = Inf, predict = Inf) - graph_learner$train(task) - - list(graph_learner = graph_learner, instance = self$instance) + train_auto(self, task, task_type = "classif") }, .predict = function(task) { @@ -251,67 +99,3 @@ LearnerClassifAuto = R6Class("LearnerClassifAuto", #' @include aaa.R learners[["classif.auto"]] = LearnerClassifAuto - -tuning_space = list( - glmnet = list( - glmnet.s = to_tune(1e-4, 1e4, logscale = TRUE), - glmnet.alpha = to_tune(0, 1) - ), - - kknn = list( - kknn.k = to_tune(1, 50, logscale = TRUE), - kknn.distance = 
to_tune(1, 5), - kknn.kernel = to_tune(c("rectangular", "optimal", "epanechnikov", "biweight", "triweight", "cos", "inv", "gaussian", "rank")) - ), - - lda = list(), - - extra_trees = list(), - - nnet = list( - nnet.maxit = to_tune(1e1, 1e3, logscale = TRUE), - nnet.decay = to_tune(1e-4, 1e-1, logscale = TRUE), - nnet.size = to_tune(2, 50, logscale = TRUE) - ), - - ranger = list( - ranger.mtry.ratio = to_tune(0, 1), - ranger.replace = to_tune(), - ranger.sample.fraction = to_tune(1e-1, 1), - ranger.num.trees = to_tune(500, 2000) - ), - - svm = list( - svm.cost = to_tune(1e-4, 1e4, logscale = TRUE), - svm.kernel = to_tune(c("polynomial", "radial", "sigmoid", "linear")), - svm.degree = to_tune(2, 5), - svm.gamma = to_tune(1e-4, 1e4, logscale = TRUE) - ), - - xgboost = list( - xgboost.eta = to_tune(1e-4, 1, logscale = TRUE), - xgboost.max_depth = to_tune(1, 20), - xgboost.colsample_bytree = to_tune(1e-1, 1), - xgboost.colsample_bylevel = to_tune(1e-1, 1), - xgboost.lambda = to_tune(1e-3, 1e3, logscale = TRUE), - xgboost.alpha = to_tune(1e-3, 1e3, logscale = TRUE), - xgboost.subsample = to_tune(1e-1, 1), - xgboost.nrounds = to_tune(1, 5000, internal = TRUE) - ), - - catboost = list( - catboost.depth = to_tune(5, 8), - catboost.learning_rate = to_tune(5e-3, 0.2, logscale = TRUE), - catboost.l2_leaf_reg = to_tune(1, 5), - catboost.iterations = to_tune(1, 500, internal = TRUE) - ), - - - lightgbm = list( - lightgbm.learning_rate = to_tune(5e-3, 0.2, logscale = TRUE), - lightgbm.feature_fraction = to_tune(0.75, 1), - lightgbm.min_data_in_leaf = to_tune(2, 60), - lightgbm.num_leaves = to_tune(16, 96), - lightgbm.num_iterations = to_tune(1, 5000, internal = TRUE) - ) -) diff --git a/R/LearnerRegrAuto.R b/R/LearnerRegrAuto.R index 20ccf50..19dc8b8 100644 --- a/R/LearnerRegrAuto.R +++ b/R/LearnerRegrAuto.R @@ -87,159 +87,7 @@ LearnerRegrAuto = R6Class("LearnerRegrAuto", ), private = list( .train = function(task) { - pv = self$param_set$values - learner_ids = pv$learner_ids - self$graph = build_graph(learner_ids, task_type = "regr") - self$tuning_space = tuning_space[learner_ids] - - lg$debug("Training '%s' on task '%s'", self$id, task$id) - - # initialize mbo tuner - tuner = tnr("adbo") - - # remove learner based on memory limit - lg$debug("Starting to select from %i learners: %s", length(learner_ids), paste0(learner_ids, collapse = ",")) - - if (!is.null(pv$max_memory)) { - memory_usage = map_dbl(learner_ids, function(learner_id) { - self$graph$pipeops[[learner_id]]$learner$estimate_memory_usage(task) / 1e6 - }) - learner_ids = learner_ids[memory_usage < pv$max_memory] - lg$debug("Checking learners for memory limit of %i MB. 
Keeping %i learner(s): %s", pv$max_memory, length(learner_ids), paste0(learner_ids, collapse = ",")) - } - - # set number of threads - if (!is.null(pv$max_nthread)) { - lg$debug("Setting number of threads per learner to %i", pv$max_nthread) - walk(learner_ids, function(learner_id) { - set_threads(self$graph$pipeops[[learner_id]]$learner, pv$max_nthread) - }) - } - - # reduce number of workers on large data sets - if (!is.null(pv$large_data_size) && task$nrow * task$ncol > pv$large_data_size) { - lg$debug("Task size larger than %i rows", pv$large_data_size) - - learner_ids = intersect(learner_ids, pv$large_data_learner_ids) - self$tuning_space = tuning_space[learner_ids] - lg$debug("Keeping %i learner(s): %s", length(learner_ids), paste0(learner_ids, collapse = ",")) - - lg$debug("Increasing number of threads per learner to %i", pv$large_data_nthread) - walk(learner_ids, function(learner_id) { - set_threads(self$graph$pipeops[[learner_id]]$learner, pv$large_data_nthread) - }) - n_workers = rush_config()$n_workers - n = max(1, floor(n_workers / pv$large_data_nthread)) - tuner$param_set$set_values(n_workers = n) - lg$debug("Reducing number of workers to %i", n) - } - - # small data resampling - resampling = if (!is.null(pv$small_data_size) && task$nrow < pv$small_data_size) { - lg$debug("Task has less than %i rows", pv$small_data_size) - lg$debug("Using small data set resampling with %i iterations", pv$small_data_resampling$iters) - pv$small_data_resampling - } else { - pv$resampling - } - - # cardinality - cardinality = map_int(task$col_info$levels, length) - if (!is.null(pv$max_cardinality) && any(cardinality > pv$max_cardinality)) { - lg$debug("Reducing number of factor levels to %i", pv$max_cardinality) - - # collapse factors - pipeop_ids = names(self$graph$pipeops) - pipeop_ids = pipeop_ids[grep("collapse", pipeop_ids)] - walk(pipeop_ids, function(pipeop_id) { - self$graph$pipeops[[pipeop_id]]$param_set$values$target_level_count = pv$max_cardinality - }) - } - - if ("extra_trees" %in% learner_ids && any(cardinality > pv$extra_trees_max_cardinality)) { - lg$debug("Reducing number of factor levels to %i for extra trees", pv$extra_trees_max_cardinality) - self$graph$pipeops$extra_trees_collapse$param_set$values$target_level_count = pv$extra_trees_max_cardinality - } - - # initialize graph learner - graph_learner = as_learner(self$graph) - graph_learner$id = "graph_learner" - graph_learner$predict_type = pv$measure$predict_type - graph_learner$fallback = lrn("regr.featureless", predict_type = pv$measure$predict_type) - graph_learner$encapsulate = c(train = "callr", predict = "callr") - graph_learner$timeout = c(train = pv$learner_timeout, predict = pv$learner_timeout) - - learners_with_validation = intersect(learner_ids, c("xgboost", "catboost", "lightgbm")) - if (length(learners_with_validation)) { - set_validate(graph_learner, "test", ids = learners_with_validation) - } - - # set early stopping - if ("xgboost" %in% learner_ids) { - graph_learner$param_set$values$xgboost.callbacks = list(cb_timeout_xgboost(pv$learner_timeout * 0.8)) - graph_learner$param_set$values$xgboost.eval_metric = pv$xgboost_eval_metric - } - if ("catboost" %in% learner_ids) { - graph_learner$param_set$values$catboost.eval_metric = pv$catboost_eval_metric - } - if ("lightgbm" %in% learner_ids) { - graph_learner$param_set$values$lightgbm.callbacks = list(cb_timeout_lightgbm(pv$learner_timeout * 0.8)) - graph_learner$param_set$values$lightgbm.eval = pv$lightgbm_eval_metric - } - - # initialize search space - 
tuning_space = unlist(unname(self$tuning_space), recursive = FALSE) - graph_scratch = graph_learner$clone(deep = TRUE) - graph_scratch$param_set$set_values(.values = tuning_space) - graph_scratch$param_set$set_values(branch.selection = to_tune(learner_ids)) - search_space = graph_scratch$param_set$search_space() - walk(learner_ids, function(learner_id) { - param_ids = search_space$ids() - param_ids = grep(paste0("^", learner_id), param_ids, value = TRUE) - walk(param_ids, function(param_id) { - # skip internal tuning parameter - if (param_id %in% c("xgboost.nrounds", "catboost.iterations", "lightgbm.num_iterations")) return() - search_space$add_dep( - id = param_id, - on = "branch.selection", - cond = CondEqual$new(learner_id) - ) - }) - }) - - # initial design - lhs_xdt = generate_lhs_design(pv$lhs_size, self$task_type, setdiff(learner_ids, "extra_trees"), self$tuning_space) - default_xdt = generate_default_design(self$task_type, learner_ids, task, self$tuning_space) - initial_xdt = rbindlist(list(lhs_xdt, default_xdt), use.names = TRUE, fill = TRUE) - setorderv(initial_xdt, "branch.selection") - tuner$param_set$set_values(initial_design = initial_xdt) - - # initialize auto tuner - self$instance = ti_async( - task = task, - learner = graph_learner, - resampling = resampling, - measures = pv$measure, - terminator = pv$terminator, - search_space = search_space, - callbacks = pv$callbacks, - store_benchmark_result = pv$store_benchmark_result - ) - - # tune - lg$debug("Learner '%s' starts tuning phase", self$id) - tuner$optimize(self$instance) - - # fit final model - lg$debug("Learner '%s' fits final model", self$id) - if (length(learners_with_validation)) { - set_validate(graph_learner, NULL, ids = intersect(learner_ids, c("xgboost", "catboost", "lightgbm"))) - } - graph_learner$param_set$set_values(.values = self$instance$result_learner_param_vals, .insert = FALSE) - graph_learner$timeout = c(train = Inf, predict = Inf) - graph_learner$train(task) - - list(graph_learner = graph_learner, instance = self$instance) + train_auto(self, task, task_type = "regr") }, .predict = function(task) { diff --git a/R/helper.R b/R/helper.R index 8d4ff40..4b27c1c 100644 --- a/R/helper.R +++ b/R/helper.R @@ -1,142 +1,3 @@ -build_graph = function(learner_ids, task_type) { - assert_choice(task_type, c("classif", "regr")) - learners_reg = c("glmnet", "kknn", "nnet", "ranger", "svm", "xgboost", "catboost", "extra_trees", "lightgbm") - if (task_type == "regr") { - assert_subset(learner_ids, learners_reg) - } else { - assert_subset(learner_ids, c(learners_reg, "lda")) - } - - branches = list() - # glmnet - if ("glmnet" %in% learner_ids) { - branch_glmnet = po("removeconstants", id = "glmnet_removeconstants") %>>% - po("imputehist", id = "glmnet_imputehist") %>>% - po("imputeoor", id = "glmnet_imputeoor") %>>% - po("fixfactors", id = "glmnet_fixfactors") %>>% - po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "glmnet_imputesample") %>>% - po("collapsefactors", target_level_count = 100, id = "glmnet_collapse") %>>% - po("encode", method = "one-hot", id = "glmnet_encode") %>>% - po("removeconstants", id = "glmnet_post_removeconstants") %>>% - lrn(paste0(task_type, ".glmnet"), id = "glmnet") - branches = c(branches, branch_glmnet) - } - - # kknn - if ("kknn" %in% learner_ids) { - branch_kknn = po("removeconstants", id = "kknn_removeconstants") %>>% - po("imputehist", id = "kknn_imputehist") %>>% - po("imputeoor", id = "kknn_imputeoor") %>>% - po("fixfactors", id = "kknn_fixfactors") %>>% 
- po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "kknn_imputesample") %>>% - po("collapsefactors", target_level_count = 100, id = "kknn_collapse") %>>% - po("removeconstants", id = "kknn_post_removeconstants") %>>% - lrn(paste0(task_type, ".kknn"), id = "kknn") - branches = c(branches, branch_kknn) - } - - # lda - # only for classification - if ("lda" %in% learner_ids) { - branch_lda = po("removeconstants", id = "lda_removeconstants") %>>% - po("imputehist", id = "lda_imputehist") %>>% - po("imputeoor", id = "lda_imputeoor") %>>% - po("fixfactors", id = "lda_fixfactors") %>>% - po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "lda_imputesample") %>>% - po("collapsefactors", target_level_count = 100, id = "lda_collapse") %>>% - po("removeconstants", id = "lda_post_removeconstants") %>>% - lrn("classif.lda", id = "lda") - branches = c(branches, branch_lda) - } - - # nnet - if ("nnet" %in% learner_ids) { - branch_nnet = po("removeconstants", id = "nnet_removeconstants") %>>% - po("imputehist", id = "nnet_imputehist") %>>% - po("imputeoor", id = "nnet_imputeoor") %>>% - po("fixfactors", id = "nnet_fixfactors") %>>% - po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "nnet_imputesample") %>>% - po("collapsefactors", target_level_count = 100, id = "nnet_collapse") %>>% - po("removeconstants", id = "nnet_post_removeconstants") %>>% - lrn(paste0(task_type, ".nnet"), id = "nnet") - branches = c(branches, branch_nnet) - } - - # ranger - if ("ranger" %in% learner_ids) { - branch_ranger = po("removeconstants", id = "ranger_removeconstants") %>>% - po("imputeoor", id = "ranger_imputeoor") %>>% - po("fixfactors", id = "ranger_fixfactors") %>>% - po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "ranger_imputesample") %>>% - po("collapsefactors", target_level_count = 100, id = "ranger_collapse") %>>% - po("removeconstants", id = "ranger_post_removeconstants") %>>% - # use upper bound of search space for memory estimation - lrn(paste0(task_type, ".ranger"), id = "ranger", num.trees = 2000) - branches = c(branches, branch_ranger) - } - - # svm - if ("svm" %in% learner_ids) { - svm_type = if (task_type == "classif") { - "C-classification" - } else { - "eps-regression" - } - branch_svm = po("removeconstants", id = "svm_removeconstants") %>>% - po("imputehist", id = "svm_imputehist") %>>% - po("imputeoor", id = "svm_imputeoor") %>>% - po("fixfactors", id = "svm_fixfactors") %>>% - po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "svm_imputesample") %>>% - po("collapsefactors", target_level_count = 100, id = "svm_collapse") %>>% - po("encode", method = "one-hot", id = "svm_encode") %>>% - po("removeconstants", id = "svm_post_removeconstants") %>>% - lrn(paste0(task_type, ".svm"), id = "svm", type = svm_type) - branches = c(branches, branch_svm) - } - - # xgboost - if ("xgboost" %in% learner_ids) { - branch_xgboost = po("removeconstants", id = "xgboost_removeconstants") %>>% - po("imputeoor", id = "xgboost_imputeoor") %>>% - po("fixfactors", id = "xgboost_fixfactors") %>>% - po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "xgboost_imputesample") %>>% - po("encodeimpact", id = "xgboost_encode") %>>% - po("removeconstants", id = "xgboost_post_removeconstants") %>>% - lrn(paste0(task_type, ".xgboost"), id = "xgboost", nrounds = 5000, early_stopping_rounds = 10) - branches = c(branches, branch_xgboost) - } - - # catboost - if 
("catboost" %in% learner_ids) { - branch_catboost = po("colapply", id = "catboost_colapply", applicator = as.numeric, affect_columns = selector_type("integer")) %>>% - lrn(paste0(task_type, ".catboost"), id = "catboost", iterations = 500, early_stopping_rounds = 10, use_best_model = TRUE) - branches = c(branches, branch_catboost) - } - - # extra trees - if ("extra_trees" %in% learner_ids) { - branch_extra_trees = po("removeconstants", id = "extra_trees_removeconstants") %>>% - po("imputeoor", id = "extra_trees_imputeoor") %>>% - po("fixfactors", id = "extra_trees_fixfactors") %>>% - po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "extra_trees_imputesample") %>>% - po("collapsefactors", target_level_count = 40, id = "extra_trees_collapse") %>>% - po("removeconstants", id = "extra_trees_post_removeconstants") %>>% - lrn(paste0(task_type, ".ranger"), id = "extra_trees", splitrule = "extratrees", num.trees = 100, replace = FALSE, sample.fraction = 1) - branches = c(branches, branch_extra_trees) - } - - # lightgbm - if ("lightgbm" %in% learner_ids) { - branch_lightgbm = lrn(paste0(task_type, ".lightgbm"), id = "lightgbm", num_iterations = 5000, early_stopping_rounds = 10) - branches = c(branches, branch_lightgbm) - } - - # branch graph - po("branch", options = learner_ids) %>>% - gunion(branches) %>>% - po("unbranch", options = learner_ids) -} - generate_default_design = function(task_type, learner_ids, task, tuning_space, branch = TRUE) { map_dtr(learner_ids, function(learner_id) { if (paste0(task_type, ".", learner_id) %nin% mlr_learners$keys()) { diff --git a/R/train_auto.R b/R/train_auto.R new file mode 100644 index 0000000..2ff174f --- /dev/null +++ b/R/train_auto.R @@ -0,0 +1,155 @@ +train_auto = function(self, task, task_type) { + pv = self$param_set$values + learner_ids = pv$learner_ids + self$graph = build_graph(learner_ids, task_type) + self$tuning_space = tuning_space[learner_ids] + + lg$debug("Training '%s' on task '%s'", self$id, task$id) + + # initialize mbo tuner + tuner = tnr("adbo") + + # remove learner based on memory limit + lg$debug("Starting to select from %i learners: %s", length(learner_ids), paste0(learner_ids, collapse = ",")) + + if (!is.null(pv$max_memory)) { + memory_usage = map_dbl(learner_ids, function(learner_id) { + self$graph$pipeops[[learner_id]]$learner$estimate_memory_usage(task) / 1e6 + }) + learner_ids = learner_ids[memory_usage < pv$max_memory] + lg$debug("Checking learners for memory limit of %i MB. 
Keeping %i learner(s): %s", pv$max_memory, length(learner_ids), paste0(learner_ids, collapse = ",")) + } + + # set number of threads + if (!is.null(pv$max_nthread)) { + lg$debug("Setting number of threads per learner to %i", pv$max_nthread) + walk(learner_ids, function(learner_id) { + set_threads(self$graph$pipeops[[learner_id]]$learner, pv$max_nthread) + }) + } + + # reduce number of workers on large data sets + if (!is.null(pv$large_data_size) && task$nrow * task$ncol > pv$large_data_size) { + lg$debug("Task size larger than %i rows", pv$large_data_size) + + learner_ids = intersect(learner_ids, pv$large_data_learner_ids) + self$tuning_space = tuning_space[learner_ids] + lg$debug("Keeping %i learner(s): %s", length(learner_ids), paste0(learner_ids, collapse = ",")) + + lg$debug("Increasing number of threads per learner to %i", pv$large_data_nthread) + walk(learner_ids, function(learner_id) { + set_threads(self$graph$pipeops[[learner_id]]$learner, pv$large_data_nthread) + }) + n_workers = rush_config()$n_workers + n = max(1, floor(n_workers / pv$large_data_nthread)) + tuner$param_set$set_values(n_workers = n) + lg$debug("Reducing number of workers to %i", n) + } + + # small data resampling + resampling = if (!is.null(pv$small_data_size) && task$nrow < pv$small_data_size) { + lg$debug("Task has less than %i rows", pv$small_data_size) + lg$debug("Using small data set resampling with %i iterations", pv$small_data_resampling$iters) + pv$small_data_resampling + } else { + pv$resampling + } + + # cardinality + cardinality = map_int(task$col_info$levels, length) + if (!is.null(pv$max_cardinality) && any(cardinality > pv$max_cardinality)) { + lg$debug("Reducing number of factor levels to %i", pv$max_cardinality) + + # collapse factors + pipeop_ids = names(self$graph$pipeops) + pipeop_ids = pipeop_ids[grep("collapse", pipeop_ids)] + walk(pipeop_ids, function(pipeop_id) { + self$graph$pipeops[[pipeop_id]]$param_set$values$target_level_count = pv$max_cardinality + }) + } + + if ("extra_trees" %in% learner_ids && any(cardinality > pv$extra_trees_max_cardinality)) { + lg$debug("Reducing number of factor levels to %i for extra trees", pv$extra_trees_max_cardinality) + self$graph$pipeops$extra_trees_collapse$param_set$values$target_level_count = pv$extra_trees_max_cardinality + } + + # initialize graph learner + graph_learner = as_learner(self$graph) + graph_learner$id = "graph_learner" + graph_learner$predict_type = pv$measure$predict_type + graph_learner$fallback = lrn(paste0(task_type, ".featureless"), predict_type = pv$measure$predict_type) + graph_learner$encapsulate = c(train = "callr", predict = "callr") + graph_learner$timeout = c(train = pv$learner_timeout, predict = pv$learner_timeout) + + learners_with_validation = intersect(learner_ids, c("xgboost", "catboost", "lightgbm")) + if (length(learners_with_validation)) { + set_validate(graph_learner, "test", ids = learners_with_validation) + } + + # set early stopping + if ("xgboost" %in% learner_ids) { + graph_learner$param_set$values$xgboost.callbacks = list(cb_timeout_xgboost(pv$learner_timeout * 0.8)) + graph_learner$param_set$values$xgboost.eval_metric = pv$xgboost_eval_metric + } + if ("catboost" %in% learner_ids) { + graph_learner$param_set$values$catboost.eval_metric = pv$catboost_eval_metric + } + if ("lightgbm" %in% learner_ids) { + graph_learner$param_set$values$lightgbm.callbacks = list(cb_timeout_lightgbm(pv$learner_timeout * 0.8)) + graph_learner$param_set$values$lightgbm.eval = pv$lightgbm_eval_metric + } + + # initialize search 
space + tuning_space = unlist(unname(self$tuning_space), recursive = FALSE) + graph_scratch = graph_learner$clone(deep = TRUE) + graph_scratch$param_set$set_values(.values = tuning_space) + graph_scratch$param_set$set_values(branch.selection = to_tune(learner_ids)) + search_space = graph_scratch$param_set$search_space() + walk(learner_ids, function(learner_id) { + param_ids = search_space$ids() + param_ids = grep(paste0("^", learner_id), param_ids, value = TRUE) + walk(param_ids, function(param_id) { + # skip internal tuning parameter + if (param_id %in% c("xgboost.nrounds", "catboost.iterations", "lightgbm.num_iterations")) return() + search_space$add_dep( + id = param_id, + on = "branch.selection", + cond = CondEqual$new(learner_id) + ) + }) + }) + + # initial design + lhs_xdt = generate_lhs_design(pv$lhs_size, self$task_type, setdiff(learner_ids, c("lda", "extra_trees")), self$tuning_space) + default_xdt = generate_default_design(self$task_type, learner_ids, task, self$tuning_space) + initial_xdt = rbindlist(list(lhs_xdt, default_xdt), use.names = TRUE, fill = TRUE) + setorderv(initial_xdt, "branch.selection") + tuner$param_set$set_values(initial_design = initial_xdt) + + # initialize auto tuner + self$instance = ti_async( + task = task, + learner = graph_learner, + resampling = resampling, + measures = pv$measure, + terminator = pv$terminator, + search_space = search_space, + callbacks = pv$callbacks, + store_benchmark_result = pv$store_benchmark_result + ) + + # tune + lg$debug("Learner '%s' starts tuning phase", self$id) + tuner$optimize(self$instance) + + # fit final model + lg$debug("Learner '%s' fits final model", self$id) + if (length(learners_with_validation)) { + set_validate(graph_learner, NULL, ids = intersect(learner_ids, c("xgboost", "catboost", "lightgbm"))) + } + graph_learner$param_set$set_values(.values = self$instance$result_learner_param_vals, .insert = FALSE) + graph_learner$timeout = c(train = Inf, predict = Inf) + graph_learner$train(task) + + list(graph_learner = graph_learner, instance = self$instance) +} \ No newline at end of file From 90c47570afba3c13c1870f997f21ad808068b756 Mon Sep 17 00:00:00 2001 From: b-zhou Date: Mon, 29 Jul 2024 15:57:41 +0200 Subject: [PATCH 07/34] fix: regr learner id --- R/LearnerRegrAuto.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/LearnerRegrAuto.R b/R/LearnerRegrAuto.R index 19dc8b8..6d93d86 100644 --- a/R/LearnerRegrAuto.R +++ b/R/LearnerRegrAuto.R @@ -22,7 +22,7 @@ LearnerRegrAuto = R6Class("LearnerRegrAuto", #' @description #' Creates a new instance of this [R6][R6::R6Class] class. 
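An aside on the one-line fix below: the constructor id had been copied from the classification learner, so the regression learner registered under the wrong key. After the change it is constructed via `lrn("regr.auto")`. A minimal usage sketch with illustrative settings borrowed from the package tests (assumes the suggested learner packages are installed and a rush worker plan is configured):

# Illustrative sketch, not part of the patch:
library(mlr3automl)

rush_plan(n_workers = 2)

task = tsk("boston_housing")
learner = lrn("regr.auto",
  learner_ids = "ranger",
  resampling = rsmp("holdout"),
  measure = msr("regr.mse"),
  terminator = trm("evals", n_evals = 6)
)
learner$train(task)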
- initialize = function(id = "classif.auto") { + initialize = function(id = "regr.auto") { param_set = ps( # learner learner_ids = p_uty(default = c("glmnet", "kknn", "nnet", "ranger", "svm", "xgboost", "catboost", "extra_trees", "lightgbm"), From 953940be71d0ad0297761f6fd95e7b75f5c3f373 Mon Sep 17 00:00:00 2001 From: b-zhou Date: Mon, 29 Jul 2024 16:27:14 +0200 Subject: [PATCH 08/34] test: extra_trees and glmnet --- tests/testthat/test_LearnerClassifAuto.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test_LearnerClassifAuto.R b/tests/testthat/test_LearnerClassifAuto.R index 7eebf33..4ea0f26 100644 --- a/tests/testthat/test_LearnerClassifAuto.R +++ b/tests/testthat/test_LearnerClassifAuto.R @@ -211,7 +211,7 @@ test_that("extra_trees and glmnet works", { ) expect_class(learner$train(task), "LearnerClassifAuto") - expect_equal(learner$model$instance$result$branch.selection, "extra_trees") + expect_choice(learner$model$instance$result$branch.selection, c("extra_trees", "glmnet")) }) test_that("lightgbm works", { From 6d241534b8a0416483555d8c5cae9c4cffe0a106 Mon Sep 17 00:00:00 2001 From: b-zhou Date: Sun, 4 Aug 2024 21:04:30 +0200 Subject: [PATCH 09/34] feat: configspace --- DESCRIPTION | 4 +- R/save_deepcave_run.R | 107 ++++++++++++++++++++++++ tests/testthat/test_save_deepcave_run.R | 24 ++++++ 3 files changed, 134 insertions(+), 1 deletion(-) create mode 100644 R/save_deepcave_run.R create mode 100644 tests/testthat/test_save_deepcave_run.R diff --git a/DESCRIPTION b/DESCRIPTION index dfc53bb..855771c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -27,7 +27,8 @@ Imports: mlr3tuningspaces, paradox (>= 1.0.1), R6, - utils + utils, + jsonlite Suggests: catboost, e1071, @@ -63,5 +64,6 @@ Collate: 'LearnerRegrAuto.R' 'helper.R' 'build_graph.R' + 'save_deepcave_run.R' 'train_auto.R' 'zzz.R' diff --git a/R/save_deepcave_run.R b/R/save_deepcave_run.R new file mode 100644 index 0000000..f5c7453 --- /dev/null +++ b/R/save_deepcave_run.R @@ -0,0 +1,107 @@ +save_deepcave_run = function(learner, path = "logs/mlr3automl") { + jsonlite::write_json( + get_configspace(learner), + paste0(path, "/configspace.json"), + auto_unbox = TRUE, pretty = TRUE, null = "null" + ) + + jsonlite::write_json( + get_configs(learner), + paste0(path, "/configs.json"), + auto_unbox = TRUE, pretty = TRUE, null = "null" + ) + + jsonlite::write_json( + get_history(learner), + paste0(path, "/history.json"), + auto_unbox = TRUE, pretty = TRUE, null = "null" + ) + + jsonlite::write_json( + get_meta(learner), + paste0(path, "/meta.json"), + auto_unbox = TRUE, pretty = TRUE, null = "null" + ) + + # origins.json + origins = rep(list(NULL), learner$instance$archive$n_evals) + names(origins) = seq(learner$instance$archive$n_evals) - 1 + jsonlite::write_json( + origins, + paste0(path, "/origins.json"), + pretty = TRUE, null = "null" + ) +} + +get_configspace = function(learner) { + n_params = nrow(learner$instance$search_space$data) + + hyperparameters_list = lapply(seq_len(n_params), function(i) { + row = search_space$data[i, ] + name = row[["id"]] + type = switch(row[["class"]], + ParamFct = "categorical", + ParamLgl = "categorical", + ParamDbl = "uniform_float", + ParamInt = "uniform_int") + + # categorical params + if (type == "categorical") { + choices = unlist(row[["levels"]]) + # TBD: default + return(list( + name = name, + type = type, + choices = choices, + weights = NULL)) + } + + # int / float params + is_logscale = search_space$is_logscale[[name]] + lower = row[["lower"]] + upper = 
row[["upper"]] + if (is_logscale) { + lower = exp(lower) + upper = exp(upper) + } + # TBD: default + return(list( + name = name, + type = type, + log = is_logscale, + lower = lower, + upper = upper)) + }) + + conditions_list = lapply(seq_len(n_params), function(i) { + row = search_space$deps[i, ] + child = row[["id"]] + parent = row[["on"]] + + # `cond` (below) is a list of `Condition`s. Currently, there are only 'CondEqual' and 'CondAnyOf', + # which should not be used simultaneously. So this list should always contain only one entry. + cond = row[["cond"]][[1]] + if (is(cond, "CondEqual")) { + return(list(child = child, parent = parent, type = "EQ", value = cond$rhs)) + } + return(list(child = child, parent = parent, type = "IN", values = cond$rhs)) + }) + + return(list( + hyperparameters = hyperparameters_list, + conditions = conditions_list, + forbiddens = list() + )) +} + +get_configs = function(learner){ + list(TBD = "TBD") +} + +get_history = function(learner){ + list(TBD = "TBD") +} + +get_meta = function(learner){ + list(TBD = "TBD") +} diff --git a/tests/testthat/test_save_deepcave_run.R b/tests/testthat/test_save_deepcave_run.R new file mode 100644 index 0000000..05befdf --- /dev/null +++ b/tests/testthat/test_save_deepcave_run.R @@ -0,0 +1,24 @@ +test_that("run is saved", { + rush_plan(n_workers = 2) + skip_if_not_installed("e1071") + + task = tsk("penguins") + learner = lrn("classif.auto", + learner_ids = "svm", + small_data_size = 1, + resampling = rsmp("holdout"), + measure = msr("classif.ce"), + terminator = trm("evals", n_evals = 6) + ) + learner$train(task) + + dir = tempdir() + + save_deepcave_run(learner, path = paste0(dir)) + + expect_file_exists(paste0(dir, "/configspace.json")) + expect_file_exists(paste0(dir, "/configs.json")) + expect_file_exists(paste0(dir, "/history.json")) + expect_file_exists(paste0(dir, "/meta.json")) + expect_file_exists(paste0(dir, "/origins.json")) +}) From 27289f06d957ebba93a6bce4f5e99ba1f3a9f38b Mon Sep 17 00:00:00 2001 From: b-zhou Date: Thu, 8 Aug 2024 19:34:20 +0200 Subject: [PATCH 10/34] docs: save deepcave --- R/save_deepcave_run.R | 94 +++++++++++++++++++++++++++++++++---------- 1 file changed, 72 insertions(+), 22 deletions(-) diff --git a/R/save_deepcave_run.R b/R/save_deepcave_run.R index f5c7453..8d291fa 100644 --- a/R/save_deepcave_run.R +++ b/R/save_deepcave_run.R @@ -1,40 +1,91 @@ -save_deepcave_run = function(learner, path = "logs/mlr3automl") { +#' @title Save Tuning History as a DeepCAVE Run +#' +#' @description +#' Exports information stored in a `TuningInstance` in a format recognized by [DeepCAVE](https://automl.github.io/DeepCAVE/main/index.html) as a run. Each run is stored as a folder containing five files `configs.json`, `configspace.json`, `history.jsonl`, `meta.json`, and `origins.json`. +#' +#' @param instance ([TuningInstanceAsyncSingleCrit]) +#' Tuning instance to save. +#' +#' @param path (`character(1)`) +#' Path to save the run. Defaults to `"logs/mlr3automl`. +#' +#' @param prefix (`character(1)`) +#' Prefix for the name of a new subfolder under `path` for storing the current run. +#' +#' @param overwrite (`character(1)`) +#' If `FALSE` (default), creates a new subfolder to save the current run. If `TRUE`, all existing runs will be deleted. 
+#' +#' @export +save_deepcave_run = function(instance, path = "logs/mlr3automl", prefix = "run", overwrite = FALSE) { + # don't save untuned instance + if (!length(instance$result_learner_param_vals)) { + warning("No run is saved, because no tuning has been completed.") + return() + } + + # create a subfolder for saving the current run + # original Python implementation see `Recorder._set_path()` + # (https://github.com/automl/DeepCAVE/blob/main/deepcave/runs/recorder.py) + if (!overwrite) { + new_idx = 0 + for (fn in list.files(path)) { + if (!startsWith(fn, "prefix")) next + idx = last(strsplit(fn, "_")[[1]]) + if (is.numeric(idx)) { + idx_int = as.integer(idx) + if (idx_int > new_idx) { + new_idx = idx_int + } + } + } + new_idx = new_idx + 1 + run_path = file.path(path, paste0(prefix, "_", new_idx)) + } + jsonlite::write_json( - get_configspace(learner), - paste0(path, "/configspace.json"), + get_configspace(instance), + paste0(run_path, "/configspace.json"), auto_unbox = TRUE, pretty = TRUE, null = "null" ) jsonlite::write_json( - get_configs(learner), - paste0(path, "/configs.json"), + get_configs(instance), + paste0(run_path, "/configs.json"), auto_unbox = TRUE, pretty = TRUE, null = "null" ) jsonlite::write_json( - get_history(learner), - paste0(path, "/history.json"), + get_history(instance), + paste0(run_path, "/history.json"), auto_unbox = TRUE, pretty = TRUE, null = "null" ) jsonlite::write_json( - get_meta(learner), - paste0(path, "/meta.json"), + get_meta(instance), + paste0(run_path, "/meta.json"), auto_unbox = TRUE, pretty = TRUE, null = "null" ) - # origins.json - origins = rep(list(NULL), learner$instance$archive$n_evals) - names(origins) = seq(learner$instance$archive$n_evals) - 1 + # create `origins.json` (a list of `null`s) + origins = rep(list(NULL), instance$instance$archive$n_evals) + names(origins) = seq(instance$instance$archive$n_evals) - 1 jsonlite::write_json( origins, - paste0(path, "/origins.json"), + paste0(run_path, "/origins.json"), pretty = TRUE, null = "null" ) } -get_configspace = function(learner) { - n_params = nrow(learner$instance$search_space$data) + +# Prepare the list for converting to `configs.json` +get_configs = function(learner){ + list(TBD = "TBD") +} + + +# Prepare the list for converting to `configspace.json` +get_configspace = function(instance) { + n_params = nrow(instance$search_space$data) hyperparameters_list = lapply(seq_len(n_params), function(i) { row = search_space$data[i, ] @@ -48,7 +99,7 @@ get_configspace = function(learner) { # categorical params if (type == "categorical") { choices = unlist(row[["levels"]]) - # TBD: default + # FIXME: the entry `default` is missing return(list( name = name, type = type, @@ -64,7 +115,7 @@ get_configspace = function(learner) { lower = exp(lower) upper = exp(upper) } - # TBD: default + # FIXME: the entry `default` entry is missing return(list( name = name, type = type, @@ -78,8 +129,9 @@ get_configspace = function(learner) { child = row[["id"]] parent = row[["on"]] - # `cond` (below) is a list of `Condition`s. Currently, there are only 'CondEqual' and 'CondAnyOf', - # which should not be used simultaneously. So this list should always contain only one entry. + # `cond` below is a list of `Condition`s. + # Currently, there are only 'CondEqual' and 'CondAnyOf', which should not be used simultaneously. + # So this list should always contain only one entry. 
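To make the two branches below concrete, an illustrative serialization (names and values assumed, not taken from a real run): a `CondEqual` dependency becomes an "EQ" condition with a single `value`, while a `CondAnyOf` dependency becomes an "IN" condition whose `values` field holds the allowed levels:

#   EQ: {"child": "svm.degree", "parent": "svm.kernel", "type": "EQ", "value": "polynomial"}
#   IN: {"child": "svm.degree", "parent": "svm.kernel", "type": "IN", "values": ["polynomial", "radial"]}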
cond = row[["cond"]][[1]] if (is(cond, "CondEqual")) { return(list(child = child, parent = parent, type = "EQ", value = cond$rhs)) @@ -94,14 +146,12 @@ get_configspace = function(learner) { )) } -get_configs = function(learner){ - list(TBD = "TBD") -} get_history = function(learner){ list(TBD = "TBD") } + get_meta = function(learner){ list(TBD = "TBD") } From 71e7de3324da222107a8f2e33e4e467332822ac4 Mon Sep 17 00:00:00 2001 From: b-zhou Date: Sun, 11 Aug 2024 00:10:08 +0200 Subject: [PATCH 11/34] feat: configs.json --- R/save_deepcave_run.R | 45 +++++++++++++++++------ tests/testthat/test_save_deepcave_run.R | 48 +++++++++++++++++++++---- 2 files changed, 76 insertions(+), 17 deletions(-) diff --git a/R/save_deepcave_run.R b/R/save_deepcave_run.R index 8d291fa..7c0f2fc 100644 --- a/R/save_deepcave_run.R +++ b/R/save_deepcave_run.R @@ -18,7 +18,7 @@ #' @export save_deepcave_run = function(instance, path = "logs/mlr3automl", prefix = "run", overwrite = FALSE) { # don't save untuned instance - if (!length(instance$result_learner_param_vals)) { + if (is.null(instance$result_learner_param_vals)) { warning("No run is saved, because no tuning has been completed.") return() } @@ -40,6 +40,14 @@ save_deepcave_run = function(instance, path = "logs/mlr3automl", prefix = "run", } new_idx = new_idx + 1 run_path = file.path(path, paste0(prefix, "_", new_idx)) + dir.create(run_path) + } else { + run_path = file.path(path, prefix) + if (file.exists(run_path)) { + lapply(list.files(run_path, full.names = TRUE), file.remove) + } else{ + dir.create(run_path) + } } jsonlite::write_json( @@ -67,8 +75,8 @@ save_deepcave_run = function(instance, path = "logs/mlr3automl", prefix = "run", ) # create `origins.json` (a list of `null`s) - origins = rep(list(NULL), instance$instance$archive$n_evals) - names(origins) = seq(instance$instance$archive$n_evals) - 1 + origins = rep(list(NULL), instance$archive$n_evals) + names(origins) = seq(instance$archive$n_evals) - 1 jsonlite::write_json( origins, paste0(run_path, "/origins.json"), @@ -77,9 +85,24 @@ save_deepcave_run = function(instance, path = "logs/mlr3automl", prefix = "run", } -# Prepare the list for converting to `configs.json` -get_configs = function(learner){ - list(TBD = "TBD") +# Prepare the lists for converting to `configs.json` +get_configs = function(instance){ + param_ids = instance$search_space$data[, id] + + configs_list = map(seq_len(instance$archive$n_evals), function(i) { + row = as.list(instance$archive$data[i, ]) + tuned_params = grep(paste0("^", row[["branch.selection"]]), param_ids, value = TRUE) + walk(tuned_params, function(param) { + if (instance$search_space$is_logscale[[param]]) { + row[[param]] = exp(row[[param]]) + } + }) + return(row[c("branch.selection", tuned_params)]) + }) + names(configs_list) = seq_along(configs_list) - 1 + jsonlite::toJSON(configs_list, auto_unbox = TRUE, null = "null", na = "null", pretty = TRUE) + + return(configs_list) } @@ -88,7 +111,7 @@ get_configspace = function(instance) { n_params = nrow(instance$search_space$data) hyperparameters_list = lapply(seq_len(n_params), function(i) { - row = search_space$data[i, ] + row = instance$search_space$data[i, ] name = row[["id"]] type = switch(row[["class"]], ParamFct = "categorical", @@ -108,7 +131,7 @@ get_configspace = function(instance) { } # int / float params - is_logscale = search_space$is_logscale[[name]] + is_logscale = instance$search_space$is_logscale[[name]] lower = row[["lower"]] upper = row[["upper"]] if (is_logscale) { @@ -125,7 +148,7 @@ get_configspace = 
function(instance) { }) conditions_list = lapply(seq_len(n_params), function(i) { - row = search_space$deps[i, ] + row = instance$search_space$deps[i, ] child = row[["id"]] parent = row[["on"]] @@ -147,11 +170,11 @@ get_configspace = function(instance) { } -get_history = function(learner){ +get_history = function(instance){ list(TBD = "TBD") } -get_meta = function(learner){ +get_meta = function(instance){ list(TBD = "TBD") } diff --git a/tests/testthat/test_save_deepcave_run.R b/tests/testthat/test_save_deepcave_run.R index 05befdf..d42e9aa 100644 --- a/tests/testthat/test_save_deepcave_run.R +++ b/tests/testthat/test_save_deepcave_run.R @@ -13,12 +13,48 @@ test_that("run is saved", { learner$train(task) dir = tempdir() + expected_path = file.path(dir, "test-1-run_1") + if (file.exists(expected_path)) { + lapply(list.files(expected_path, full.names = TRUE), file.remove) + } + file.remove(expected_path) - save_deepcave_run(learner, path = paste0(dir)) + save_deepcave_run(learner$instance, path = dir, prefix = "test-1-run", overwrite = FALSE) - expect_file_exists(paste0(dir, "/configspace.json")) - expect_file_exists(paste0(dir, "/configs.json")) - expect_file_exists(paste0(dir, "/history.json")) - expect_file_exists(paste0(dir, "/meta.json")) - expect_file_exists(paste0(dir, "/origins.json")) + expect_file_exists(file.path(expected_path, "configspace.json")) + expect_file_exists(file.path(expected_path, "configs.json")) + # expect_file_exists(file.path(expected_path, "history.jsonl")) + expect_file_exists(file.path(expected_path, "meta.json")) + expect_file_exists(file.path(expected_path, "origins.json")) +}) + +test_that("overwriting works", { + dir = tempdir() + expected_path = file.path(dir, "test-run-overwrite") + file.create(file.path(expected_path, "configs.json"), showWarnings = FALSE) + file.create(file.path(expected_path, "configspace.json"), showWarnings = FALSE) + file.create(file.path(expected_path, "history.jsonl"), showWarnings = FALSE) + file.create(file.path(expected_path, "meta.json"), showWarnings = FALSE) + file.create(file.path(expected_path, "origins.json"), showWarnings = FALSE) + + rush_plan(n_workers = 2) + skip_if_not_installed("e1071") + + task = tsk("penguins") + learner = lrn("classif.auto", + learner_ids = "svm", + small_data_size = 1, + resampling = rsmp("holdout"), + measure = msr("classif.ce"), + terminator = trm("evals", n_evals = 6) + ) + learner$train(task) + + save_deepcave_run(learner$instance, path = dir, prefix = "test-run-overwrite", overwrite = TRUE) + + expect_file_exists(file.path(expected_path, "configspace.json")) + expect_file_exists(file.path(expected_path, "configs.json")) + # expect_file_exists(file.path(expected_path, "history.jsonl")) + expect_file_exists(file.path(expected_path, "meta.json")) + expect_file_exists(file.path(expected_path, "origins.json")) }) From 26706d73ebac4bf8efe784787acee10136de8d17 Mon Sep 17 00:00:00 2001 From: b-zhou Date: Sun, 11 Aug 2024 15:21:45 +0200 Subject: [PATCH 12/34] feat: history.jsonl --- R/save_deepcave_run.R | 45 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 37 insertions(+), 8 deletions(-) diff --git a/R/save_deepcave_run.R b/R/save_deepcave_run.R index 7c0f2fc..864c39d 100644 --- a/R/save_deepcave_run.R +++ b/R/save_deepcave_run.R @@ -62,17 +62,46 @@ save_deepcave_run = function(instance, path = "logs/mlr3automl", prefix = "run", auto_unbox = TRUE, pretty = TRUE, null = "null" ) - jsonlite::write_json( - get_history(instance), - paste0(run_path, "/history.json"), - auto_unbox = TRUE, 
pretty = TRUE, null = "null"
-  )
+  # jsonlite::write_json(
+  #   get_history(instance),
+  #   paste0(run_path, "/history.json"),
+  #   auto_unbox = TRUE, pretty = TRUE, null = "null"
+  # )

   jsonlite::write_json(
     get_meta(instance),
     paste0(run_path, "/meta.json"),
     auto_unbox = TRUE, pretty = TRUE, null = "null"
   )
+
+  # stream out `history.jsonl`
+  n_evals = instance$archive$n_evals
+  # FIXME: make time an optional cost
+  costs = c(instance$objective$codomain$data[, id], "runtime_learners")
+  selected_cols = c(costs, "timestamp_xs", "timestamp_ys", "state")
+  history_table = instance$archive$data[, ..selected_cols][, .(
+    config_id = seq_len(n_evals) - 1,
+    budget = 0,
+    seed = -1,
+    costs = lapply(transpose(.SD), c),
+    # handle start and end time (time elapsed since first timestamp)
+    # see https://github.com/automl/DeepCAVE/blob/main/deepcave/runs/recorder.py
+    start_time = as.numeric(timestamp_xs - timestamp_xs[1]),
+    end_time = as.numeric(timestamp_ys - timestamp_ys[1]),
+    # state is either "finished" (SUCESS = 1) or "queued" (NOT_EVALUATED = 0)
+    # see https://github.com/automl/DeepCAVE/blob/main/deepcave/runs/status.py
+    state = ifelse(state == "finished", 1, 6),
+    additionals = list()
+  ), .SDcols = costs]
+
+  con = file("history.jsonl", open = "w")
+  jsonlite::stream_out(
+    history_table,
+    con,
+    auto_unbox = TRUE, pretty = TRUE, null = "list", na = "null",
+    dataframe = "values"
+  )
+  close(con)

   # create `origins.json` (a list of `null`s)
   origins = rep(list(NULL), instance$archive$n_evals)
   names(origins) = seq(instance$archive$n_evals) - 1

From 836c873bf77e47eb38f0c5ecd2a04ae7233082dd Mon Sep 17 00:00:00 2001
From: b-zhou
Date: Sun, 11 Aug 2024 15:50:46 +0200
Subject: [PATCH 13/34] fix: save jsonl

---
 R/save_deepcave_run.R                   | 20 ++++----------------
 tests/testthat/test_save_deepcave_run.R |  6 +++---
 2 files changed, 7 insertions(+), 19 deletions(-)

diff --git a/R/save_deepcave_run.R b/R/save_deepcave_run.R
index 864c39d..2a7c5a2 100644
--- a/R/save_deepcave_run.R
+++ b/R/save_deepcave_run.R
@@ -28,8 +28,8 @@ save_deepcave_run = function(instance, path = "logs/mlr3automl", prefix = "run",
   # (https://github.com/automl/DeepCAVE/blob/main/deepcave/runs/recorder.py)
   if (!overwrite) {
     new_idx = 0
-    for (fn in list.files(path)) {
-      if (!startsWith(fn, "prefix")) next
+    walk(list.files(path), function(fn) {
+      # compare against the `prefix` argument, not the literal string "prefix"
+      if (!startsWith(fn, prefix)) return()
       idx = last(strsplit(fn, "_")[[1]])
-      if (is.numeric(idx)) {
+      # `idx` is a character string, so test for digits instead of is.numeric()
+      if (grepl("^\\d+$", idx)) {
         idx_int = as.integer(idx)
         if (idx_int > new_idx) {
-          new_idx = idx_int
+          # assign into the enclosing scope, since this now runs inside a closure
+          new_idx <<- idx_int
         }
       }
-    }
+    })
     new_idx = new_idx + 1
@@ -62,12 +62,6 @@ save_deepcave_run = function(instance, path = "logs/mlr3automl", prefix = "run",
     auto_unbox = TRUE, pretty = TRUE, null = "null"
   )

-  # jsonlite::write_json(
-  #   get_history(instance),
-  #   paste0(run_path, "/history.json"),
-  #   auto_unbox = TRUE, pretty = TRUE, null = "null"
-  # )
-
   jsonlite::write_json(
     get_meta(instance),
     paste0(run_path, "/meta.json"),
     auto_unbox = TRUE, pretty = TRUE, null = "null"
   )

-  con = file("history.jsonl", open = "w")
+  con = file(file.path(run_path, "history.jsonl"), open = "w")
   jsonlite::stream_out(
     history_table,
     con,
@@ -192,12 +192,6 @@
get_configspace = function(instance) { )) } - -# get_history = function(instance){ -# list(TBD = "TBD") -# } - - get_meta = function(instance){ list(TBD = "TBD") } diff --git a/tests/testthat/test_save_deepcave_run.R b/tests/testthat/test_save_deepcave_run.R index d42e9aa..d24cd47 100644 --- a/tests/testthat/test_save_deepcave_run.R +++ b/tests/testthat/test_save_deepcave_run.R @@ -16,14 +16,14 @@ test_that("run is saved", { expected_path = file.path(dir, "test-1-run_1") if (file.exists(expected_path)) { lapply(list.files(expected_path, full.names = TRUE), file.remove) + file.remove(expected_path) } - file.remove(expected_path) save_deepcave_run(learner$instance, path = dir, prefix = "test-1-run", overwrite = FALSE) expect_file_exists(file.path(expected_path, "configspace.json")) expect_file_exists(file.path(expected_path, "configs.json")) - # expect_file_exists(file.path(expected_path, "history.jsonl")) + expect_file_exists(file.path(expected_path, "history.jsonl")) expect_file_exists(file.path(expected_path, "meta.json")) expect_file_exists(file.path(expected_path, "origins.json")) }) @@ -54,7 +54,7 @@ test_that("overwriting works", { expect_file_exists(file.path(expected_path, "configspace.json")) expect_file_exists(file.path(expected_path, "configs.json")) - # expect_file_exists(file.path(expected_path, "history.jsonl")) + expect_file_exists(file.path(expected_path, "history.jsonl")) expect_file_exists(file.path(expected_path, "meta.json")) expect_file_exists(file.path(expected_path, "origins.json")) }) From 2962a041d1397736de596eda7b0ee47f0b3ff649 Mon Sep 17 00:00:00 2001 From: b-zhou Date: Sun, 11 Aug 2024 16:59:33 +0200 Subject: [PATCH 14/34] feat: meta.json --- R/save_deepcave_run.R | 105 +++++++++++++++++++++++++++++------------- 1 file changed, 74 insertions(+), 31 deletions(-) diff --git a/R/save_deepcave_run.R b/R/save_deepcave_run.R index 2a7c5a2..c684837 100644 --- a/R/save_deepcave_run.R +++ b/R/save_deepcave_run.R @@ -50,54 +50,39 @@ save_deepcave_run = function(instance, path = "logs/mlr3automl", prefix = "run", } } + + # `configspace.json` jsonlite::write_json( get_configspace(instance), paste0(run_path, "/configspace.json"), auto_unbox = TRUE, pretty = TRUE, null = "null" ) - + + # `configs.json` jsonlite::write_json( get_configs(instance), - paste0(run_path, "/configs.json"), + file.path(run_path, "configs.json"), auto_unbox = TRUE, pretty = TRUE, null = "null" ) + # `meta.json` jsonlite::write_json( get_meta(instance), - paste0(run_path, "/meta.json"), + file.path(run_path, "meta.json"), auto_unbox = TRUE, pretty = TRUE, null = "null" ) - # stream out `history.jsonl` - n_evals = instance$archive$n_evals - # FIXME: make time an optional cost - costs = c(instance$objective$codomain$data[, id], "runtime_learners") - selected_cols = c(costs, "timestamp_xs", "timestamp_ys", "state") - history_table = instance$archive$data[, ..selected_cols][, .( - config_id = seq_len(n_evals) - 1, - budget = 0, - seed = -1, - costs = lapply(transpose(.SD), c), - # handle start and end time (time elapsed since first timestamp) - # see https://github.com/automl/DeepCAVE/blob/main/deepcave/runs/recorder.py - start_time = as.numeric(timestamp_xs - timestamp_xs[1]), - end_time = as.numeric(timestamp_ys - timestamp_ys[1]), - # state is either "finished" (SUCESS = 1) or "queued" (NOT_EVALUATED = 0) - # see https://github.com/automl/DeepCAVE/blob/main/deepcave/runs/status.py - state = ifelse(state == "finished", 1, 6), - additionals = list() - ), .SDcols = costs] - + # `history.jsonl` con = 
file(file.path(run_path, "history.jsonl"), open = "w") jsonlite::stream_out( - history_table, + get_history(instance), con, auto_unbox = TRUE, pretty = TRUE, null = "list", na = "null", dataframe = "values" ) close(con) - # create `origins.json` (a list of `null`s) + # `origins.json` (a list of `null`s) origins = rep(list(NULL), instance$archive$n_evals) names(origins) = seq(instance$archive$n_evals) - 1 jsonlite::write_json( @@ -122,8 +107,8 @@ get_configs = function(instance){ }) return(row[c("branch.selection", tuned_params)]) }) + names(configs_list) = seq_along(configs_list) - 1 - jsonlite::toJSON(configs_list, auto_unbox = TRUE, null = "null", na = "null", pretty = TRUE) return(configs_list) } @@ -133,7 +118,7 @@ get_configs = function(instance){ get_configspace = function(instance) { n_params = nrow(instance$search_space$data) - hyperparameters_list = lapply(seq_len(n_params), function(i) { + hyperparameters_list = map(seq_len(n_params), function(i) { row = instance$search_space$data[i, ] name = row[["id"]] type = switch(row[["class"]], @@ -150,7 +135,8 @@ get_configspace = function(instance) { name = name, type = type, choices = choices, - weights = NULL)) + weights = NULL + )) } # int / float params @@ -167,10 +153,11 @@ get_configspace = function(instance) { type = type, log = is_logscale, lower = lower, - upper = upper)) + upper = upper + )) }) - conditions_list = lapply(seq_len(n_params), function(i) { + conditions_list = map(seq_len(n_params), function(i) { row = instance$search_space$deps[i, ] child = row[["id"]] parent = row[["on"]] @@ -192,6 +179,62 @@ get_configspace = function(instance) { )) } +get_history = function(instance) { + selected_cols = c(costs, "timestamp_xs", "timestamp_ys", "state") + history_table = instance$archive$data[, ..selected_cols][, .( + config_id = seq_len(n_evals) - 1, + budget = 0, + seed = -1, + costs = lapply(transpose(.SD), c), + # handle start and end time (time elapsed since first timestamp) + # see https://github.com/automl/DeepCAVE/blob/main/deepcave/runs/recorder.py + start_time = as.numeric(timestamp_xs - timestamp_xs[1]), + end_time = as.numeric(timestamp_ys - timestamp_ys[1]), + # state is either "finished" (SUCESS = 1) or "queued" (NOT_EVALUATED = 0) + # see https://github.com/automl/DeepCAVE/blob/main/deepcave/runs/status.py + state = ifelse(state == "finished", 1, 6), + additionals = list() + ), .SDcols = costs] + + return(history_table) +} + get_meta = function(instance){ - list(TBD = "TBD") + costs = instance$objective$codomain$data[, id] + + objectives_list = map(costs, function(cost) { + measure = msr(cost) + + lower = measure$range[[1]] + if (is.finite(lower)) { + lock_lower = TRUE + } else { + lower = min(instance$archive$data[, ..cost]) + lock_lower = FALSE + } + + upper = measure$range[[2]] + if (is.finite(upper)) { + lock_upper = TRUE + } else { + upper = max(instance$archive$data[, ..cost]) + lock_upper = FALSE + } + + optimize = if (measure$minimize) { + "lower" + } else { + "upper" + } + + return(list(name = cost, lower = lower, upper = upper, + lock_lower = lock_lower, lock_upper = lock_upper, optimize = optimize)) + + }) + + return(list( + objectives = objectives_list, + budgets = rep(list(0), instance$archive$n_evals), + seeds = list(-1) + )) } From 02647285bfce98e36015bdef182f10a590f7e590 Mon Sep 17 00:00:00 2001 From: b-zhou Date: Sun, 11 Aug 2024 17:01:29 +0200 Subject: [PATCH 15/34] fix: jsonl verbose --- R/save_deepcave_run.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git 
a/R/save_deepcave_run.R b/R/save_deepcave_run.R index c684837..870dd70 100644 --- a/R/save_deepcave_run.R +++ b/R/save_deepcave_run.R @@ -78,7 +78,8 @@ save_deepcave_run = function(instance, path = "logs/mlr3automl", prefix = "run", get_history(instance), con, auto_unbox = TRUE, pretty = TRUE, null = "list", na = "null", - dataframe = "values" + dataframe = "values", + verbose = FALSE ) close(con) From 2e8ec9472f939cacdcab36e322595bbaf03d5b34 Mon Sep 17 00:00:00 2001 From: b-zhou Date: Sun, 11 Aug 2024 19:00:25 +0200 Subject: [PATCH 16/34] fix: conditions --- R/save_deepcave_run.R | 49 ++++++++++++++++++++++++++----------------- 1 file changed, 30 insertions(+), 19 deletions(-) diff --git a/R/save_deepcave_run.R b/R/save_deepcave_run.R index 870dd70..c6b5f0b 100644 --- a/R/save_deepcave_run.R +++ b/R/save_deepcave_run.R @@ -84,8 +84,8 @@ save_deepcave_run = function(instance, path = "logs/mlr3automl", prefix = "run", close(con) # `origins.json` (a list of `null`s) - origins = rep(list(NULL), instance$archive$n_evals) - names(origins) = seq(instance$archive$n_evals) - 1 + origins = rep(list(NULL), nrow(instance$archive$data)) + names(origins) = seq(nrow(instance$archive$data)) - 1 jsonlite::write_json( origins, paste0(run_path, "/origins.json"), @@ -98,15 +98,15 @@ save_deepcave_run = function(instance, path = "logs/mlr3automl", prefix = "run", get_configs = function(instance){ param_ids = instance$search_space$data[, id] - configs_list = map(seq_len(instance$archive$n_evals), function(i) { - row = as.list(instance$archive$data[i, ]) - tuned_params = grep(paste0("^", row[["branch.selection"]]), param_ids, value = TRUE) + configs_list = map(seq_len(nrow(instance$archive$data)), function(i) { + config = as.list(instance$archive$data[i, ]) + tuned_params = grep(paste0("^", config[["branch.selection"]]), param_ids, value = TRUE) walk(tuned_params, function(param) { if (instance$search_space$is_logscale[[param]]) { - row[[param]] = exp(row[[param]]) + config[[param]] = exp(config[[param]]) } }) - return(row[c("branch.selection", tuned_params)]) + return(discard(config[c("branch.selection", tuned_params)], is.na)) }) names(configs_list) = seq_along(configs_list) - 1 @@ -118,6 +118,7 @@ get_configs = function(instance){ # Prepare the list for converting to `configspace.json` get_configspace = function(instance) { n_params = nrow(instance$search_space$data) + param_ids = instance$search_space$data[, id] hyperparameters_list = map(seq_len(n_params), function(i) { row = instance$search_space$data[i, ] @@ -131,12 +132,13 @@ get_configspace = function(instance) { # categorical params if (type == "categorical") { choices = unlist(row[["levels"]]) - # FIXME: the entry `default` is missing + # FIXME: `default` is wrong return(list( name = name, type = type, choices = choices, - weights = NULL + default = choices[[1]], + probabilisties = NULL )) } @@ -144,33 +146,40 @@ get_configspace = function(instance) { is_logscale = instance$search_space$is_logscale[[name]] lower = row[["lower"]] upper = row[["upper"]] + default = mean(lower, upper) if (is_logscale) { lower = exp(lower) upper = exp(upper) + default = exp(default) } - # FIXME: the entry `default` entry is missing + # FIXME: default is wrong return(list( name = name, type = type, log = is_logscale, lower = lower, - upper = upper + upper = upper, + default = default, + q = NULL )) }) - conditions_list = map(seq_len(n_params), function(i) { - row = instance$search_space$deps[i, ] - child = row[["id"]] - parent = row[["on"]] + conditions_list = 
map(setdiff(param_ids, "branch.selection"), function(param_id) { + dependency = instance$search_space$deps[id == param_id, ] + if (nrow(dependency) > 1) { + dependency = dependency[on != "branch.selection", ] + } + child = param_id + parent = dependency[["on"]] # `cond` below is a list of `Condition`s. # Currently, there are only 'CondEqual' and 'CondAnyOf', which should not be used simultaneously. # So this list should always contain only one entry. - cond = row[["cond"]][[1]] + cond = dependency[["cond"]][[1]] if (is(cond, "CondEqual")) { return(list(child = child, parent = parent, type = "EQ", value = cond$rhs)) - } - return(list(child = child, parent = parent, type = "IN", values = cond$rhs)) + } + return(list(child = child, parent = parent, type = "IN", values = cond$rhs)) }) return(list( @@ -181,9 +190,11 @@ get_configspace = function(instance) { } get_history = function(instance) { + costs = instance$objective$codomain$data[, id] + selected_cols = c(costs, "timestamp_xs", "timestamp_ys", "state") history_table = instance$archive$data[, ..selected_cols][, .( - config_id = seq_len(n_evals) - 1, + config_id = seq_len(nrow(instance$archive$data)) - 1, budget = 0, seed = -1, costs = lapply(transpose(.SD), c), From 04e4ee7c7734f6054c3cf3635d4d7926d51ad078 Mon Sep 17 00:00:00 2001 From: b-zhou Date: Sun, 11 Aug 2024 20:26:27 +0200 Subject: [PATCH 17/34] fix: configs --- R/save_deepcave_run.R | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/R/save_deepcave_run.R b/R/save_deepcave_run.R index c6b5f0b..4480bd8 100644 --- a/R/save_deepcave_run.R +++ b/R/save_deepcave_run.R @@ -97,18 +97,13 @@ save_deepcave_run = function(instance, path = "logs/mlr3automl", prefix = "run", # Prepare the lists for converting to `configs.json` get_configs = function(instance){ param_ids = instance$search_space$data[, id] + logscale_params = param_ids[instance$search_space$is_logscale[param_ids]] + config_table = instance$archive$data[, param_ids, with = FALSE] + config_table[, (logscale_params) := lapply(.SD, exp), .SDcols = logscale_params] - configs_list = map(seq_len(nrow(instance$archive$data)), function(i) { - config = as.list(instance$archive$data[i, ]) - tuned_params = grep(paste0("^", config[["branch.selection"]]), param_ids, value = TRUE) - walk(tuned_params, function(param) { - if (instance$search_space$is_logscale[[param]]) { - config[[param]] = exp(config[[param]]) - } - }) - return(discard(config[c("branch.selection", tuned_params)], is.na)) - }) - + configs_list = map(seq_len(nrow(config_table)), function(i) { + discard(as.list(config_table[i, ]), is.na) + }) names(configs_list) = seq_along(configs_list) - 1 return(configs_list) @@ -190,7 +185,7 @@ get_configspace = function(instance) { } get_history = function(instance) { - costs = instance$objective$codomain$data[, id] + costs = c(instance$objective$codomain$data[, id], "runtime_learners") selected_cols = c(costs, "timestamp_xs", "timestamp_ys", "state") history_table = instance$archive$data[, ..selected_cols][, .( @@ -202,9 +197,9 @@ get_history = function(instance) { # see https://github.com/automl/DeepCAVE/blob/main/deepcave/runs/recorder.py start_time = as.numeric(timestamp_xs - timestamp_xs[1]), end_time = as.numeric(timestamp_ys - timestamp_ys[1]), - # state is either "finished" (SUCESS = 1) or "queued" (NOT_EVALUATED = 0) + # state is either "finished" <=> SUCESS = 1 or ABORTED = 0 # see https://github.com/automl/DeepCAVE/blob/main/deepcave/runs/status.py - state = ifelse(state == "finished", 1, 6), 
+      state = ifelse(state == "finished", 1, 5),
       additionals = list()
     ), .SDcols = costs]

From b1c7d1e779de82c90bf0ed6a4997f906b3a03089 Mon Sep 17 00:00:00 2001
From: b-zhou
Date: Sun, 11 Aug 2024 20:26:52 +0200
Subject: [PATCH 18/34] fix: budget

---
 R/save_deepcave_run.R | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/R/save_deepcave_run.R b/R/save_deepcave_run.R
index 4480bd8..57dfb40 100644
--- a/R/save_deepcave_run.R
+++ b/R/save_deepcave_run.R
@@ -236,12 +236,20 @@ get_meta = function(instance){
     return(list(name = cost, lower = lower, upper = upper,
       lock_lower = lock_lower, lock_upper = lock_upper, optimize = optimize))
-  })
+  })
+
+  objectives_list = c(objectives_list, list(list(
+    name = "time",
+    lower = 0,
+    upper = max(instance$archive$data[, runtime_learners]),
+    lock_lower = TRUE,
+    lock_upper = FALSE,
+    optimize = "lower"
+  )))
+
   return(list(
     objectives = objectives_list,
-    budgets = rep(list(0), instance$archive$n_evals),
+    budgets = list(0),
     seeds = list(-1)
   ))
 }

From 15722db53c0a28cba06f1b786f1524e4dad28310 Mon Sep 17 00:00:00 2001
From: b-zhou
Date: Sun, 11 Aug 2024 20:54:27 +0200
Subject: [PATCH 19/34] refactor: comments

---
 R/save_deepcave_run.R | 33 +++++++++++++++++++++------------
 1 file changed, 21 insertions(+), 12 deletions(-)

diff --git a/R/save_deepcave_run.R b/R/save_deepcave_run.R
index 57dfb40..cbba48f 100644
--- a/R/save_deepcave_run.R
+++ b/R/save_deepcave_run.R
@@ -94,7 +94,7 @@ save_deepcave_run = function(instance, path = "logs/mlr3automl", prefix = "run",
 }

-# Prepare the lists for converting to `configs.json`
+# Prepare the list for converting to `configs.json`
 get_configs = function(instance){
   param_ids = instance$search_space$data[, id]
@@ -115,9 +115,9 @@ get_configs = function(instance){
 # Prepare the list for converting to `configspace.json`
 get_configspace = function(instance) {
   n_params = nrow(instance$search_space$data)
   param_ids = instance$search_space$data[, id]

-  hyperparameters_list = map(seq_len(n_params), function(i) {
-    row = instance$search_space$data[i, ]
-    name = row[["id"]]
+  hyperparameters_list = map(param_ids, function(param_id) {
+    row = instance$search_space$data[id == param_id, ]
+
     type = switch(row[["class"]],
       ParamFct = "categorical",
       ParamLgl = "categorical",
       ParamDbl = "uniform_float",
       ParamInt = "uniform_int")

     # categorical params
     if (type == "categorical") {
       choices = unlist(row[["levels"]])
-      # FIXME: `default` is wrong
       return(list(
-        name = name,
+        name = param_id,
         type = type,
         choices = choices,
+        # FIXME: `default` is wrong
         default = choices[[1]],
-        probabilisties = NULL
+        probabilities = NULL
       ))
     }

     # int / float params
-    is_logscale = instance$search_space$is_logscale[[name]]
+    is_logscale = instance$search_space$is_logscale[[param_id]]
     lower = row[["lower"]]
     upper = row[["upper"]]
+    # FIXME: default is wrong
-    default = mean(lower, upper)
+    default = mean(c(lower, upper))
     if (is_logscale) {
       lower = exp(lower)
       upper = exp(upper)
       default = exp(default)
     }
-    # FIXME: default is wrong
     return(list(
-      name = name,
+      name = param_id,
       type = type,
       log = is_logscale,
       lower = lower,

   conditions_list = map(setdiff(param_ids, "branch.selection"), function(param_id) {
     dependency = instance$search_space$deps[id == param_id, ]
+    # `svm.degree` and `svm.gamma` depend on `svm.kernel` as well as `branch.selection`.
+    # DeepCAVE does not allow one parameter to be conditioned on multiple others.
+    # So remove their dependency on `branch.selection`.
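To illustrate the filter below (rows are assumed for illustration, not read from a real search space): for `svm.degree` the deps table holds two rows,

#   id           on                  cond
#   svm.degree   branch.selection    CondEqual("svm")
#   svm.degree   svm.kernel          CondEqual("polynomial")

and dropping the `branch.selection` row leaves a single parent, `svm.kernel`, which is the shape DeepCAVE expects.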
if (nrow(dependency) > 1) {
       dependency = dependency[on != "branch.selection", ]
     }
     child = param_id
     parent = dependency[["on"]]

+# Prepare the data.table for converting to `history.jsonl`
 get_history = function(instance) {
   costs = c(instance$objective$codomain$data[, id], "runtime_learners")

   selected_cols = c(costs, "timestamp_xs", "timestamp_ys", "state")
   history_table = instance$archive$data[, ..selected_cols][, .(
     config_id = seq_len(nrow(instance$archive$data)) - 1,
     budget = 0,
     seed = -1,
+    # combine costs into a list column
     costs = lapply(transpose(.SD), c),
     # handle start and end time (time elapsed since first timestamp)
     # see https://github.com/automl/DeepCAVE/blob/main/deepcave/runs/recorder.py
+    # start and end time here might have a different meaning than in the original implementation
     start_time = as.numeric(timestamp_xs - timestamp_xs[1]),
     end_time = as.numeric(timestamp_ys - timestamp_ys[1]),
-    # state is either "finished" <=> SUCESS = 1 or ABORTED = 0
+    # state is either "finished" <=> SUCCESS = 1 or ABORTED = 5
     # see https://github.com/automl/DeepCAVE/blob/main/deepcave/runs/status.py
     state = ifelse(state == "finished", 1, 5),
     additionals = list()
   ), .SDcols = costs]

   return(history_table)
 }

+
+# Prepare the list for converting to `meta.json`
 get_meta = function(instance){
+  # time is handled separately below
   costs = instance$objective$codomain$data[, id]

   objectives_list = map(costs, function(cost) {
     measure = msr(cost)

     return(list(name = cost, lower = lower, upper = upper,
       lock_lower = lock_lower, lock_upper = lock_upper, optimize = optimize))
-  })
+  })

   objectives_list = c(objectives_list, list(list(

From 7412811e5a08bb59a5b563187850ec2931a96c96 Mon Sep 17 00:00:00 2001
From: b-zhou
Date: Sun, 11 Aug 2024 20:55:26 +0200
Subject: [PATCH 20/34] chore: gitignore local testing files

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 5387b99..626665b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -97,3 +97,4 @@ rsconnect/
 /attic/
 .Rprofile
 kaggle/
+deepcave
\ No newline at end of file

From a0e36e30e4f51aa27e3a6d87c69a8f41baa8bb2f Mon Sep 17 00:00:00 2001
From: b-zhou
Date: Tue, 13 Aug 2024 14:28:16 +0200
Subject: [PATCH 21/34] revert: remove regr related stuff

This reverts commits 953940b, 90c4757, 2927164, 823bced, 4bb4d0c, 2c99b98,
30de063, 766bccd

---
 DESCRIPTION                              |   3 -
 R/LearnerClassifAuto.R                   | 343 ++++++++++++++++++++++-
 R/LearnerRegrAuto.R                      |  66 +----
 R/build_graph.R                          | 202 -------------
 R/train_auto.R                           | 155 ----------
 tests/testthat/test_LearnerClassifAuto.R |   2 +-
 tests/testthat/test_LearnerRegrAuto.R    | 296 -------------------
 7 files changed, 346 insertions(+), 721 deletions(-)
 delete mode 100644 R/build_graph.R
 delete mode 100644 R/train_auto.R
 delete mode 100644 tests/testthat/test_LearnerRegrAuto.R

diff --git a/DESCRIPTION b/DESCRIPTION
index 855771c..938ca9c 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -63,7 +63,4 @@ Collate:
     'LearnerClassifAutoXgboost.R'
     'LearnerRegrAuto.R'
     'helper.R'
-    'build_graph.R'
-    'save_deepcave_run.R'
-    'train_auto.R'
     'zzz.R'
diff --git a/R/LearnerClassifAuto.R b/R/LearnerClassifAuto.R
index 1d740b0..b3b126e 100644
--- a/R/LearnerClassifAuto.R
+++ b/R/LearnerClassifAuto.R
@@ -87,7 +87,159 @@ LearnerClassifAuto = R6Class("LearnerClassifAuto",

   private = list(
     .train = function(task) {
-      train_auto(self, task, task_type = "classif")
+      pv = self$param_set$values
+      learner_ids = pv$learner_ids
+      self$graph = build_graph(learner_ids)
+      self$tuning_space =
tuning_space[learner_ids] + + lg$debug("Training '%s' on task '%s'", self$id, task$id) + + # initialize mbo tuner + tuner = tnr("adbo") + + # remove learner based on memory limit + lg$debug("Starting to select from %i learners: %s", length(learner_ids), paste0(learner_ids, collapse = ",")) + + if (!is.null(pv$max_memory)) { + memory_usage = map_dbl(learner_ids, function(learner_id) { + self$graph$pipeops[[learner_id]]$learner$estimate_memory_usage(task) / 1e6 + }) + learner_ids = learner_ids[memory_usage < pv$max_memory] + lg$debug("Checking learners for memory limit of %i MB. Keeping %i learner(s): %s", pv$max_memory, length(learner_ids), paste0(learner_ids, collapse = ",")) + } + + # set number of threads + if (!is.null(pv$max_nthread)) { + lg$debug("Setting number of threads per learner to %i", pv$max_nthread) + walk(learner_ids, function(learner_id) { + set_threads(self$graph$pipeops[[learner_id]]$learner, pv$max_nthread) + }) + } + + # reduce number of workers on large data sets + if (!is.null(pv$large_data_size) && task$nrow * task$ncol > pv$large_data_size) { + lg$debug("Task size larger than %i rows", pv$large_data_size) + + learner_ids = intersect(learner_ids, pv$large_data_learner_ids) + self$tuning_space = tuning_space[learner_ids] + lg$debug("Keeping %i learner(s): %s", length(learner_ids), paste0(learner_ids, collapse = ",")) + + lg$debug("Increasing number of threads per learner to %i", pv$large_data_nthread) + walk(learner_ids, function(learner_id) { + set_threads(self$graph$pipeops[[learner_id]]$learner, pv$large_data_nthread) + }) + n_workers = rush_config()$n_workers + n = max(1, floor(n_workers / pv$large_data_nthread)) + tuner$param_set$set_values(n_workers = n) + lg$debug("Reducing number of workers to %i", n) + } + + # small data resampling + resampling = if (!is.null(pv$small_data_size) && task$nrow < pv$small_data_size) { + lg$debug("Task has less than %i rows", pv$small_data_size) + lg$debug("Using small data set resampling with %i iterations", pv$small_data_resampling$iters) + pv$small_data_resampling + } else { + pv$resampling + } + + # cardinality + cardinality = map_int(task$col_info$levels, length) + if (!is.null(pv$max_cardinality) && any(cardinality > pv$max_cardinality)) { + lg$debug("Reducing number of factor levels to %i", pv$max_cardinality) + + # collapse factors + pipeop_ids = names(self$graph$pipeops) + pipeop_ids = pipeop_ids[grep("collapse", pipeop_ids)] + walk(pipeop_ids, function(pipeop_id) { + self$graph$pipeops[[pipeop_id]]$param_set$values$target_level_count = pv$max_cardinality + }) + } + + if ("extra_trees" %in% learner_ids && any(cardinality > pv$extra_trees_max_cardinality)) { + lg$debug("Reducing number of factor levels to %i for extra trees", pv$extra_trees_max_cardinality) + self$graph$pipeops$extra_trees_collapse$param_set$values$target_level_count = pv$extra_trees_max_cardinality + } + + # initialize graph learner + graph_learner = as_learner(self$graph) + graph_learner$id = "graph_learner" + graph_learner$predict_type = pv$measure$predict_type + graph_learner$fallback = lrn("classif.featureless", predict_type = pv$measure$predict_type) + graph_learner$encapsulate = c(train = "callr", predict = "callr") + graph_learner$timeout = c(train = pv$learner_timeout, predict = pv$learner_timeout) + + learners_with_validation = intersect(learner_ids, c("xgboost", "catboost", "lightgbm")) + if (length(learners_with_validation)) { + set_validate(graph_learner, "test", ids = learners_with_validation) + } + + # set early stopping + if ("xgboost" 
%in% learner_ids) { + graph_learner$param_set$values$xgboost.callbacks = list(cb_timeout_xgboost(pv$learner_timeout * 0.8)) + graph_learner$param_set$values$xgboost.eval_metric = pv$xgboost_eval_metric + } + if ("catboost" %in% learner_ids) { + graph_learner$param_set$values$catboost.eval_metric = pv$catboost_eval_metric + } + if ("lightgbm" %in% learner_ids) { + graph_learner$param_set$values$lightgbm.callbacks = list(cb_timeout_lightgbm(pv$learner_timeout * 0.8)) + graph_learner$param_set$values$lightgbm.eval = pv$lightgbm_eval_metric + } + + # initialize search space + tuning_space = unlist(unname(self$tuning_space), recursive = FALSE) + graph_scratch = graph_learner$clone(deep = TRUE) + graph_scratch$param_set$set_values(.values = tuning_space) + graph_scratch$param_set$set_values(branch.selection = to_tune(learner_ids)) + search_space = graph_scratch$param_set$search_space() + walk(learner_ids, function(learner_id) { + param_ids = search_space$ids() + param_ids = grep(paste0("^", learner_id), param_ids, value = TRUE) + walk(param_ids, function(param_id) { + # skip internal tuning parameter + if (param_id %in% c("xgboost.nrounds", "catboost.iterations", "lightgbm.num_iterations")) return() + search_space$add_dep( + id = param_id, + on = "branch.selection", + cond = CondEqual$new(learner_id) + ) + }) + }) + + # initial design + lhs_xdt = generate_lhs_design(pv$lhs_size, self$task_type, setdiff(learner_ids, c("lda", "extra_trees")), self$tuning_space) + default_xdt = generate_default_design(self$task_type, learner_ids, task, self$tuning_space) + initial_xdt = rbindlist(list(lhs_xdt, default_xdt), use.names = TRUE, fill = TRUE) + setorderv(initial_xdt, "branch.selection") + tuner$param_set$set_values(initial_design = initial_xdt) + + # initialize auto tuner + self$instance = ti_async( + task = task, + learner = graph_learner, + resampling = resampling, + measures = pv$measure, + terminator = pv$terminator, + search_space = search_space, + callbacks = pv$callbacks, + store_benchmark_result = pv$store_benchmark_result + ) + + # tune + lg$debug("Learner '%s' starts tuning phase", self$id) + tuner$optimize(self$instance) + + # fit final model + lg$debug("Learner '%s' fits final model", self$id) + if (length(learners_with_validation)) { + set_validate(graph_learner, NULL, ids = intersect(learner_ids, c("xgboost", "catboost", "lightgbm"))) + } + graph_learner$param_set$set_values(.values = self$instance$result_learner_param_vals, .insert = FALSE) + graph_learner$timeout = c(train = Inf, predict = Inf) + graph_learner$train(task) + + list(graph_learner = graph_learner, instance = self$instance) }, .predict = function(task) { @@ -99,3 +251,192 @@ LearnerClassifAuto = R6Class("LearnerClassifAuto", #' @include aaa.R learners[["classif.auto"]] = LearnerClassifAuto + +build_graph = function(learner_ids) { + branches = list() + # glmnet + if ("glmnet" %in% learner_ids) { + branch_glmnet = po("removeconstants", id = "glmnet_removeconstants") %>>% + po("imputehist", id = "glmnet_imputehist") %>>% + po("imputeoor", id = "glmnet_imputeoor") %>>% + po("fixfactors", id = "glmnet_fixfactors") %>>% + po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "glmnet_imputesample") %>>% + po("collapsefactors", target_level_count = 100, id = "glmnet_collapse") %>>% + po("encode", method = "one-hot", id = "glmnet_encode") %>>% + po("removeconstants", id = "glmnet_post_removeconstants") %>>% + lrn("classif.glmnet", id = "glmnet") + branches = c(branches, branch_glmnet) + } + + # kknn + if 
("kknn" %in% learner_ids) { + branch_kknn = po("removeconstants", id = "kknn_removeconstants") %>>% + po("imputehist", id = "kknn_imputehist") %>>% + po("imputeoor", id = "kknn_imputeoor") %>>% + po("fixfactors", id = "kknn_fixfactors") %>>% + po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "kknn_imputesample") %>>% + po("collapsefactors", target_level_count = 100, id = "kknn_collapse") %>>% + po("removeconstants", id = "kknn_post_removeconstants") %>>% + lrn("classif.kknn", id = "kknn") + branches = c(branches, branch_kknn) + } + + # lda + if ("lda" %in% learner_ids) { + branch_lda = po("removeconstants", id = "lda_removeconstants") %>>% + po("imputehist", id = "lda_imputehist") %>>% + po("imputeoor", id = "lda_imputeoor") %>>% + po("fixfactors", id = "lda_fixfactors") %>>% + po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "lda_imputesample") %>>% + po("collapsefactors", target_level_count = 100, id = "lda_collapse") %>>% + po("removeconstants", id = "lda_post_removeconstants") %>>% + lrn("classif.lda", id = "lda") + branches = c(branches, branch_lda) + } + + # nnet + if ("nnet" %in% learner_ids) { + branch_nnet = po("removeconstants", id = "nnet_removeconstants") %>>% + po("imputehist", id = "nnet_imputehist") %>>% + po("imputeoor", id = "nnet_imputeoor") %>>% + po("fixfactors", id = "nnet_fixfactors") %>>% + po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "nnet_imputesample") %>>% + po("collapsefactors", target_level_count = 100, id = "nnet_collapse") %>>% + po("removeconstants", id = "nnet_post_removeconstants") %>>% + lrn("classif.nnet", id = "nnet") + branches = c(branches, branch_nnet) + } + + # ranger + if ("ranger" %in% learner_ids) { + branch_ranger = po("removeconstants", id = "ranger_removeconstants") %>>% + po("imputeoor", id = "ranger_imputeoor") %>>% + po("fixfactors", id = "ranger_fixfactors") %>>% + po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "ranger_imputesample") %>>% + po("collapsefactors", target_level_count = 100, id = "ranger_collapse") %>>% + po("removeconstants", id = "ranger_post_removeconstants") %>>% + # use upper bound of search space for memory estimation + lrn("classif.ranger", id = "ranger", num.trees = 2000) + branches = c(branches, branch_ranger) + } + + # svm + if ("svm" %in% learner_ids) { + branch_svm = po("removeconstants", id = "svm_removeconstants") %>>% + po("imputehist", id = "svm_imputehist") %>>% + po("imputeoor", id = "svm_imputeoor") %>>% + po("fixfactors", id = "svm_fixfactors") %>>% + po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "svm_imputesample") %>>% + po("collapsefactors", target_level_count = 100, id = "svm_collapse") %>>% + po("encode", method = "one-hot", id = "svm_encode") %>>% + po("removeconstants", id = "svm_post_removeconstants") %>>% + lrn("classif.svm", id = "svm", type = "C-classification") + branches = c(branches, branch_svm) + } + + # xgboost + if ("xgboost" %in% learner_ids) { + branch_xgboost = po("removeconstants", id = "xgboost_removeconstants") %>>% + po("imputeoor", id = "xgboost_imputeoor") %>>% + po("fixfactors", id = "xgboost_fixfactors") %>>% + po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "xgboost_imputesample") %>>% + po("encodeimpact", id = "xgboost_encode") %>>% + po("removeconstants", id = "xgboost_post_removeconstants") %>>% + lrn("classif.xgboost", id = "xgboost", nrounds = 5000, early_stopping_rounds = 10) + 
branches = c(branches, branch_xgboost) + } + + # catboost + if ("catboost" %in% learner_ids) { + branch_catboost = po("colapply", id = "catboost_colapply", applicator = as.numeric, affect_columns = selector_type("integer")) %>>% + lrn("classif.catboost", id = "catboost", iterations = 500, early_stopping_rounds = 10, use_best_model = TRUE) + branches = c(branches, branch_catboost) + } + + # extra trees + if ("extra_trees" %in% learner_ids) { + branch_extra_trees = po("removeconstants", id = "extra_trees_removeconstants") %>>% + po("imputeoor", id = "extra_trees_imputeoor") %>>% + po("fixfactors", id = "extra_trees_fixfactors") %>>% + po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "extra_trees_imputesample") %>>% + po("collapsefactors", target_level_count = 40, id = "extra_trees_collapse") %>>% + po("removeconstants", id = "extra_trees_post_removeconstants") %>>% + lrn("classif.ranger", id = "extra_trees", splitrule = "extratrees", num.trees = 100, replace = FALSE, sample.fraction = 1) + branches = c(branches, branch_extra_trees) + } + + # lightgbm + if ("lightgbm" %in% learner_ids) { + branch_lightgbm = lrn("classif.lightgbm", id = "lightgbm", num_iterations = 5000, early_stopping_rounds = 10) + branches = c(branches, branch_lightgbm) + } + + # branch graph + po("branch", options = learner_ids) %>>% + gunion(branches) %>>% + po("unbranch", options = learner_ids) +} + +tuning_space = list( + glmnet = list( + glmnet.s = to_tune(1e-4, 1e4, logscale = TRUE), + glmnet.alpha = to_tune(0, 1) + ), + + kknn = list( + kknn.k = to_tune(1, 50, logscale = TRUE), + kknn.distance = to_tune(1, 5), + kknn.kernel = to_tune(c("rectangular", "optimal", "epanechnikov", "biweight", "triweight", "cos", "inv", "gaussian", "rank")) + ), + + lda = list(), + + extra_trees = list(), + + nnet = list( + nnet.maxit = to_tune(1e1, 1e3, logscale = TRUE), + nnet.decay = to_tune(1e-4, 1e-1, logscale = TRUE), + nnet.size = to_tune(2, 50, logscale = TRUE) + ), + + ranger = list( + ranger.mtry.ratio = to_tune(0, 1), + ranger.replace = to_tune(), + ranger.sample.fraction = to_tune(1e-1, 1), + ranger.num.trees = to_tune(500, 2000) + ), + + svm = list( + svm.cost = to_tune(1e-4, 1e4, logscale = TRUE), + svm.kernel = to_tune(c("polynomial", "radial", "sigmoid", "linear")), + svm.degree = to_tune(2, 5), + svm.gamma = to_tune(1e-4, 1e4, logscale = TRUE) + ), + + xgboost = list( + xgboost.eta = to_tune(1e-4, 1, logscale = TRUE), + xgboost.max_depth = to_tune(1, 20), + xgboost.colsample_bytree = to_tune(1e-1, 1), + xgboost.colsample_bylevel = to_tune(1e-1, 1), + xgboost.lambda = to_tune(1e-3, 1e3, logscale = TRUE), + xgboost.alpha = to_tune(1e-3, 1e3, logscale = TRUE), + xgboost.subsample = to_tune(1e-1, 1), + xgboost.nrounds = to_tune(1, 5000, internal = TRUE) + ), + + catboost = list( + catboost.depth = to_tune(5, 8), + catboost.learning_rate = to_tune(5e-3, 0.2, logscale = TRUE), + catboost.l2_leaf_reg = to_tune(1, 5), + catboost.iterations = to_tune(1, 500, internal = TRUE) + ), + + + lightgbm = list( + lightgbm.learning_rate = to_tune(5e-3, 0.2, logscale = TRUE), + lightgbm.feature_fraction = to_tune(0.75, 1), + lightgbm.min_data_in_leaf = to_tune(2, 60), + lightgbm.num_leaves = to_tune(16, 96), + lightgbm.num_iterations = to_tune(1, 5000, internal = TRUE) + ) +) diff --git a/R/LearnerRegrAuto.R b/R/LearnerRegrAuto.R index 6d93d86..bee6e86 100644 --- a/R/LearnerRegrAuto.R +++ b/R/LearnerRegrAuto.R @@ -22,77 +22,17 @@ LearnerRegrAuto = R6Class("LearnerRegrAuto", #' @description #' Creates a new 
instance of this [R6][R6::R6Class] class. - initialize = function(id = "regr.auto") { - param_set = ps( - # learner - learner_ids = p_uty(default = c("glmnet", "kknn", "nnet", "ranger", "svm", "xgboost", "catboost", "extra_trees", "lightgbm"), - custom_check = function(x) { - if (length(x) == 1 && x == "extra_trees") { - return("Learner 'extra_trees' must be combined with other learners") - } - check_subset(x, c("glmnet", "kknn", "nnet", "ranger", "svm", "xgboost", "catboost", "extra_trees", "lightgbm")) - }), - learner_timeout = p_int(lower = 1L, default = 900L), - xgboost_eval_metric = p_uty(), - catboost_eval_metric = p_uty(), - lightgbm_eval_metric = p_uty(), - # system - max_nthread = p_int(lower = 1L, default = 1L), - max_memory = p_int(lower = 1L, default = 32000L), - # large data - large_data_size = p_int(lower = 1L, default = 1e6), - large_data_learner_ids = p_uty(), - large_data_nthread = p_int(lower = 1L, default = 4L), - # small data - small_data_size = p_int(lower = 1L, default = 5000L), - small_data_resampling = p_uty(), - max_cardinality = p_int(lower = 1L, default = 100L), - extra_trees_max_cardinality = p_int(lower = 1L, default = 40L), - # tuner - resampling = p_uty(), - terminator = p_uty(), - measure = p_uty(), - lhs_size = p_int(lower = 1L, default = 4L), - callbacks = p_uty(), - store_benchmark_result = p_lgl(default = FALSE)) + initialize = function(id = "classif.auto") { - param_set$set_values( - learner_ids = c("glmnet", "kknn", "nnet", "ranger", "svm", "xgboost", "catboost", "extra_trees", "lightgbm"), - learner_timeout = 900L, - max_nthread = 1L, - max_memory = 32000L, - large_data_size = 1e6L, - large_data_learner_ids = c("ranger", "xgboost", "catboost", "extra_trees", "lightgbm"), - large_data_nthread = 4L, - small_data_size = 5000L, - small_data_resampling = rsmp("cv", folds = 10L), - max_cardinality = 100L, - extra_trees_max_cardinality = 40L, - resampling = rsmp("cv", folds = 3L), - terminator = trm("run_time", secs = 14400L), - measure = msr("regr.mse"), - lhs_size = 4L, - store_benchmark_result = FALSE) - - super$initialize( - id = id, - task_type = "regr", - param_set = param_set, - packages = c("mlr3tuning", "mlr3learners", "mlr3pipelines", "mlr3mbo", "mlr3automl", "xgboost", "catboost", "lightgbm", "ranger", "nnet", "kknn", "glmnet", "e1071"), - feature_types = c("logical", "integer", "numeric", "character", "factor"), - predict_types = "response", - properties = c("missings", "weights") - ) } ), private = list( .train = function(task) { - train_auto(self, task, task_type = "regr") + }, .predict = function(task) { - lg$debug("Predicting with '%s' on task '%s'", self$id, task$id) - self$model$graph_learner$predict(task) + } ) ) diff --git a/R/build_graph.R b/R/build_graph.R deleted file mode 100644 index c863ba6..0000000 --- a/R/build_graph.R +++ /dev/null @@ -1,202 +0,0 @@ -build_graph = function(learner_ids, task_type) { - assert_choice(task_type, c("classif", "regr")) - learners_reg = c("glmnet", "kknn", "nnet", "ranger", "svm", "xgboost", "catboost", "extra_trees", "lightgbm") - if (task_type == "regr") { - assert_subset(learner_ids, learners_reg) - } else { - assert_subset(learner_ids, c(learners_reg, "lda")) - } - - branches = list() - # glmnet - if ("glmnet" %in% learner_ids) { - branch_glmnet = po("removeconstants", id = "glmnet_removeconstants") %>>% - po("imputehist", id = "glmnet_imputehist") %>>% - po("imputeoor", id = "glmnet_imputeoor") %>>% - po("fixfactors", id = "glmnet_fixfactors") %>>% - po("imputesample", affect_columns = 
selector_type(c("factor", "ordered")), id = "glmnet_imputesample") %>>% - po("collapsefactors", target_level_count = 100, id = "glmnet_collapse") %>>% - po("encode", method = "one-hot", id = "glmnet_encode") %>>% - po("removeconstants", id = "glmnet_post_removeconstants") %>>% - lrn(paste0(task_type, ".glmnet"), id = "glmnet") - branches = c(branches, branch_glmnet) - } - - # kknn - if ("kknn" %in% learner_ids) { - branch_kknn = po("removeconstants", id = "kknn_removeconstants") %>>% - po("imputehist", id = "kknn_imputehist") %>>% - po("imputeoor", id = "kknn_imputeoor") %>>% - po("fixfactors", id = "kknn_fixfactors") %>>% - po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "kknn_imputesample") %>>% - po("collapsefactors", target_level_count = 100, id = "kknn_collapse") %>>% - po("removeconstants", id = "kknn_post_removeconstants") %>>% - lrn(paste0(task_type, ".kknn"), id = "kknn") - branches = c(branches, branch_kknn) - } - - # lda - # only for classification - if ("lda" %in% learner_ids) { - branch_lda = po("removeconstants", id = "lda_removeconstants") %>>% - po("imputehist", id = "lda_imputehist") %>>% - po("imputeoor", id = "lda_imputeoor") %>>% - po("fixfactors", id = "lda_fixfactors") %>>% - po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "lda_imputesample") %>>% - po("collapsefactors", target_level_count = 100, id = "lda_collapse") %>>% - po("removeconstants", id = "lda_post_removeconstants") %>>% - lrn("classif.lda", id = "lda") - branches = c(branches, branch_lda) - } - - # nnet - if ("nnet" %in% learner_ids) { - branch_nnet = po("removeconstants", id = "nnet_removeconstants") %>>% - po("imputehist", id = "nnet_imputehist") %>>% - po("imputeoor", id = "nnet_imputeoor") %>>% - po("fixfactors", id = "nnet_fixfactors") %>>% - po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "nnet_imputesample") %>>% - po("collapsefactors", target_level_count = 100, id = "nnet_collapse") %>>% - po("removeconstants", id = "nnet_post_removeconstants") %>>% - lrn(paste0(task_type, ".nnet"), id = "nnet") - branches = c(branches, branch_nnet) - } - - # ranger - if ("ranger" %in% learner_ids) { - branch_ranger = po("removeconstants", id = "ranger_removeconstants") %>>% - po("imputeoor", id = "ranger_imputeoor") %>>% - po("fixfactors", id = "ranger_fixfactors") %>>% - po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "ranger_imputesample") %>>% - po("collapsefactors", target_level_count = 100, id = "ranger_collapse") %>>% - po("removeconstants", id = "ranger_post_removeconstants") %>>% - # use upper bound of search space for memory estimation - lrn(paste0(task_type, ".ranger"), id = "ranger", num.trees = 2000) - branches = c(branches, branch_ranger) - } - - # svm - if ("svm" %in% learner_ids) { - svm_type = if (task_type == "classif") { - "C-classification" - } else { - "eps-regression" - } - branch_svm = po("removeconstants", id = "svm_removeconstants") %>>% - po("imputehist", id = "svm_imputehist") %>>% - po("imputeoor", id = "svm_imputeoor") %>>% - po("fixfactors", id = "svm_fixfactors") %>>% - po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "svm_imputesample") %>>% - po("collapsefactors", target_level_count = 100, id = "svm_collapse") %>>% - po("encode", method = "one-hot", id = "svm_encode") %>>% - po("removeconstants", id = "svm_post_removeconstants") %>>% - lrn(paste0(task_type, ".svm"), id = "svm", type = svm_type) - branches = c(branches, 
branch_svm) - } - - # xgboost - if ("xgboost" %in% learner_ids) { - branch_xgboost = po("removeconstants", id = "xgboost_removeconstants") %>>% - po("imputeoor", id = "xgboost_imputeoor") %>>% - po("fixfactors", id = "xgboost_fixfactors") %>>% - po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "xgboost_imputesample") %>>% - po("encodeimpact", id = "xgboost_encode") %>>% - po("removeconstants", id = "xgboost_post_removeconstants") %>>% - lrn(paste0(task_type, ".xgboost"), id = "xgboost", nrounds = 5000, early_stopping_rounds = 10) - branches = c(branches, branch_xgboost) - } - - # catboost - if ("catboost" %in% learner_ids) { - branch_catboost = po("colapply", id = "catboost_colapply", applicator = as.numeric, affect_columns = selector_type("integer")) %>>% - lrn(paste0(task_type, ".catboost"), id = "catboost", iterations = 500, early_stopping_rounds = 10, use_best_model = TRUE) - branches = c(branches, branch_catboost) - } - - # extra trees - if ("extra_trees" %in% learner_ids) { - branch_extra_trees = po("removeconstants", id = "extra_trees_removeconstants") %>>% - po("imputeoor", id = "extra_trees_imputeoor") %>>% - po("fixfactors", id = "extra_trees_fixfactors") %>>% - po("imputesample", affect_columns = selector_type(c("factor", "ordered")), id = "extra_trees_imputesample") %>>% - po("collapsefactors", target_level_count = 40, id = "extra_trees_collapse") %>>% - po("removeconstants", id = "extra_trees_post_removeconstants") %>>% - lrn(paste0(task_type, ".ranger"), id = "extra_trees", splitrule = "extratrees", num.trees = 100, replace = FALSE, sample.fraction = 1) - branches = c(branches, branch_extra_trees) - } - - # lightgbm - if ("lightgbm" %in% learner_ids) { - branch_lightgbm = lrn(paste0(task_type, ".lightgbm"), id = "lightgbm", num_iterations = 5000, early_stopping_rounds = 10) - branches = c(branches, branch_lightgbm) - } - - # branch graph - po("branch", options = learner_ids) %>>% - gunion(branches) %>>% - po("unbranch", options = learner_ids) -} - -tuning_space = list( - glmnet = list( - glmnet.s = to_tune(1e-4, 1e4, logscale = TRUE), - glmnet.alpha = to_tune(0, 1) - ), - - kknn = list( - kknn.k = to_tune(1, 50, logscale = TRUE), - kknn.distance = to_tune(1, 5), - kknn.kernel = to_tune(c("rectangular", "optimal", "epanechnikov", "biweight", "triweight", "cos", "inv", "gaussian", "rank")) - ), - - lda = list(), - - extra_trees = list(), - - nnet = list( - nnet.maxit = to_tune(1e1, 1e3, logscale = TRUE), - nnet.decay = to_tune(1e-4, 1e-1, logscale = TRUE), - nnet.size = to_tune(2, 50, logscale = TRUE) - ), - - ranger = list( - ranger.mtry.ratio = to_tune(0, 1), - ranger.replace = to_tune(), - ranger.sample.fraction = to_tune(1e-1, 1), - ranger.num.trees = to_tune(500, 2000) - ), - - svm = list( - svm.cost = to_tune(1e-4, 1e4, logscale = TRUE), - svm.kernel = to_tune(c("polynomial", "radial", "sigmoid", "linear")), - svm.degree = to_tune(2, 5), - svm.gamma = to_tune(1e-4, 1e4, logscale = TRUE) - ), - - xgboost = list( - xgboost.eta = to_tune(1e-4, 1, logscale = TRUE), - xgboost.max_depth = to_tune(1, 20), - xgboost.colsample_bytree = to_tune(1e-1, 1), - xgboost.colsample_bylevel = to_tune(1e-1, 1), - xgboost.lambda = to_tune(1e-3, 1e3, logscale = TRUE), - xgboost.alpha = to_tune(1e-3, 1e3, logscale = TRUE), - xgboost.subsample = to_tune(1e-1, 1), - xgboost.nrounds = to_tune(1, 5000, internal = TRUE) - ), - - catboost = list( - catboost.depth = to_tune(5, 8), - catboost.learning_rate = to_tune(5e-3, 0.2, logscale = TRUE), - catboost.l2_leaf_reg = 
to_tune(1, 5), - catboost.iterations = to_tune(1, 500, internal = TRUE) - ), - - - lightgbm = list( - lightgbm.learning_rate = to_tune(5e-3, 0.2, logscale = TRUE), - lightgbm.feature_fraction = to_tune(0.75, 1), - lightgbm.min_data_in_leaf = to_tune(2, 60), - lightgbm.num_leaves = to_tune(16, 96), - lightgbm.num_iterations = to_tune(1, 5000, internal = TRUE) - ) -) \ No newline at end of file diff --git a/R/train_auto.R b/R/train_auto.R deleted file mode 100644 index 2ff174f..0000000 --- a/R/train_auto.R +++ /dev/null @@ -1,155 +0,0 @@ -train_auto = function(self, task, task_type) { - pv = self$param_set$values - learner_ids = pv$learner_ids - self$graph = build_graph(learner_ids, task_type) - self$tuning_space = tuning_space[learner_ids] - - lg$debug("Training '%s' on task '%s'", self$id, task$id) - - # initialize mbo tuner - tuner = tnr("adbo") - - # remove learner based on memory limit - lg$debug("Starting to select from %i learners: %s", length(learner_ids), paste0(learner_ids, collapse = ",")) - - if (!is.null(pv$max_memory)) { - memory_usage = map_dbl(learner_ids, function(learner_id) { - self$graph$pipeops[[learner_id]]$learner$estimate_memory_usage(task) / 1e6 - }) - learner_ids = learner_ids[memory_usage < pv$max_memory] - lg$debug("Checking learners for memory limit of %i MB. Keeping %i learner(s): %s", pv$max_memory, length(learner_ids), paste0(learner_ids, collapse = ",")) - } - - # set number of threads - if (!is.null(pv$max_nthread)) { - lg$debug("Setting number of threads per learner to %i", pv$max_nthread) - walk(learner_ids, function(learner_id) { - set_threads(self$graph$pipeops[[learner_id]]$learner, pv$max_nthread) - }) - } - - # reduce number of workers on large data sets - if (!is.null(pv$large_data_size) && task$nrow * task$ncol > pv$large_data_size) { - lg$debug("Task size larger than %i rows", pv$large_data_size) - - learner_ids = intersect(learner_ids, pv$large_data_learner_ids) - self$tuning_space = tuning_space[learner_ids] - lg$debug("Keeping %i learner(s): %s", length(learner_ids), paste0(learner_ids, collapse = ",")) - - lg$debug("Increasing number of threads per learner to %i", pv$large_data_nthread) - walk(learner_ids, function(learner_id) { - set_threads(self$graph$pipeops[[learner_id]]$learner, pv$large_data_nthread) - }) - n_workers = rush_config()$n_workers - n = max(1, floor(n_workers / pv$large_data_nthread)) - tuner$param_set$set_values(n_workers = n) - lg$debug("Reducing number of workers to %i", n) - } - - # small data resampling - resampling = if (!is.null(pv$small_data_size) && task$nrow < pv$small_data_size) { - lg$debug("Task has less than %i rows", pv$small_data_size) - lg$debug("Using small data set resampling with %i iterations", pv$small_data_resampling$iters) - pv$small_data_resampling - } else { - pv$resampling - } - - # cardinality - cardinality = map_int(task$col_info$levels, length) - if (!is.null(pv$max_cardinality) && any(cardinality > pv$max_cardinality)) { - lg$debug("Reducing number of factor levels to %i", pv$max_cardinality) - - # collapse factors - pipeop_ids = names(self$graph$pipeops) - pipeop_ids = pipeop_ids[grep("collapse", pipeop_ids)] - walk(pipeop_ids, function(pipeop_id) { - self$graph$pipeops[[pipeop_id]]$param_set$values$target_level_count = pv$max_cardinality - }) - } - - if ("extra_trees" %in% learner_ids && any(cardinality > pv$extra_trees_max_cardinality)) { - lg$debug("Reducing number of factor levels to %i for extra trees", pv$extra_trees_max_cardinality) - 
self$graph$pipeops$extra_trees_collapse$param_set$values$target_level_count = pv$extra_trees_max_cardinality - } - - # initialize graph learner - graph_learner = as_learner(self$graph) - graph_learner$id = "graph_learner" - graph_learner$predict_type = pv$measure$predict_type - graph_learner$fallback = lrn(paste0(task_type, ".featureless"), predict_type = pv$measure$predict_type) - graph_learner$encapsulate = c(train = "callr", predict = "callr") - graph_learner$timeout = c(train = pv$learner_timeout, predict = pv$learner_timeout) - - learners_with_validation = intersect(learner_ids, c("xgboost", "catboost", "lightgbm")) - if (length(learners_with_validation)) { - set_validate(graph_learner, "test", ids = learners_with_validation) - } - - # set early stopping - if ("xgboost" %in% learner_ids) { - graph_learner$param_set$values$xgboost.callbacks = list(cb_timeout_xgboost(pv$learner_timeout * 0.8)) - graph_learner$param_set$values$xgboost.eval_metric = pv$xgboost_eval_metric - } - if ("catboost" %in% learner_ids) { - graph_learner$param_set$values$catboost.eval_metric = pv$catboost_eval_metric - } - if ("lightgbm" %in% learner_ids) { - graph_learner$param_set$values$lightgbm.callbacks = list(cb_timeout_lightgbm(pv$learner_timeout * 0.8)) - graph_learner$param_set$values$lightgbm.eval = pv$lightgbm_eval_metric - } - - # initialize search space - tuning_space = unlist(unname(self$tuning_space), recursive = FALSE) - graph_scratch = graph_learner$clone(deep = TRUE) - graph_scratch$param_set$set_values(.values = tuning_space) - graph_scratch$param_set$set_values(branch.selection = to_tune(learner_ids)) - search_space = graph_scratch$param_set$search_space() - walk(learner_ids, function(learner_id) { - param_ids = search_space$ids() - param_ids = grep(paste0("^", learner_id), param_ids, value = TRUE) - walk(param_ids, function(param_id) { - # skip internal tuning parameter - if (param_id %in% c("xgboost.nrounds", "catboost.iterations", "lightgbm.num_iterations")) return() - search_space$add_dep( - id = param_id, - on = "branch.selection", - cond = CondEqual$new(learner_id) - ) - }) - }) - - # initial design - lhs_xdt = generate_lhs_design(pv$lhs_size, self$task_type, setdiff(learner_ids, c("lda", "extra_trees")), self$tuning_space) - default_xdt = generate_default_design(self$task_type, learner_ids, task, self$tuning_space) - initial_xdt = rbindlist(list(lhs_xdt, default_xdt), use.names = TRUE, fill = TRUE) - setorderv(initial_xdt, "branch.selection") - tuner$param_set$set_values(initial_design = initial_xdt) - - # initialize auto tuner - self$instance = ti_async( - task = task, - learner = graph_learner, - resampling = resampling, - measures = pv$measure, - terminator = pv$terminator, - search_space = search_space, - callbacks = pv$callbacks, - store_benchmark_result = pv$store_benchmark_result - ) - - # tune - lg$debug("Learner '%s' starts tuning phase", self$id) - tuner$optimize(self$instance) - - # fit final model - lg$debug("Learner '%s' fits final model", self$id) - if (length(learners_with_validation)) { - set_validate(graph_learner, NULL, ids = intersect(learner_ids, c("xgboost", "catboost", "lightgbm"))) - } - graph_learner$param_set$set_values(.values = self$instance$result_learner_param_vals, .insert = FALSE) - graph_learner$timeout = c(train = Inf, predict = Inf) - graph_learner$train(task) - - list(graph_learner = graph_learner, instance = self$instance) -} \ No newline at end of file diff --git a/tests/testthat/test_LearnerClassifAuto.R b/tests/testthat/test_LearnerClassifAuto.R 
index 4ea0f26..7eebf33 100644 --- a/tests/testthat/test_LearnerClassifAuto.R +++ b/tests/testthat/test_LearnerClassifAuto.R @@ -211,7 +211,7 @@ test_that("extra_trees and glmnet works", { ) expect_class(learner$train(task), "LearnerClassifAuto") - expect_choice(learner$model$instance$result$branch.selection, c("extra_trees", "glmnet")) + expect_equal(learner$model$instance$result$branch.selection, "extra_trees") }) test_that("lightgbm works", { diff --git a/tests/testthat/test_LearnerRegrAuto.R b/tests/testthat/test_LearnerRegrAuto.R deleted file mode 100644 index 4b274cc..0000000 --- a/tests/testthat/test_LearnerRegrAuto.R +++ /dev/null @@ -1,296 +0,0 @@ -test_that("glmnet works (regr)", { - rush_plan(n_workers = 2) - skip_if_not_installed("glmnet") - - task = tsk("boston_housing") - learner = lrn("regr.auto", - learner_ids = "glmnet", - small_data_size = 1, - resampling = rsmp("holdout"), - measure = msr("regr.mse"), - terminator = trm("evals", n_evals = 6) - ) - - expect_class(learner$train(task), "LearnerRegrAuto") - expect_equal(learner$graph$param_set$values$branch.selection, "glmnet") - expect_equal(learner$model$instance$result$branch.selection, "glmnet") -}) - -test_that("kknn works (regr)", { - rush_plan(n_workers = 2) - skip_if_not_installed("kknn") - - task = tsk("boston_housing") - learner = lrn("regr.auto", - learner_ids = "kknn", - small_data_size = 1, - resampling = rsmp("holdout"), - measure = msr("regr.mse"), - terminator = trm("evals", n_evals = 6) - ) - - expect_class(learner$train(task), "LearnerRegrAuto") - expect_equal(learner$graph$param_set$values$branch.selection, "kknn") - expect_equal(learner$model$instance$result$branch.selection, "kknn") -}) - -test_that("nnet works (regr)", { - rush_plan(n_workers = 2) - skip_if_not_installed("nnet") - - task = tsk("boston_housing") - learner = lrn("regr.auto", - learner_ids = "nnet", - resampling = rsmp("holdout"), - small_data_size = 1, - measure = msr("regr.mse"), - terminator = trm("evals", n_evals = 6) - ) - - expect_class(learner$train(task), "LearnerRegrAuto") - expect_equal(learner$model$instance$result$branch.selection, "nnet") -}) - -test_that("ranger works (regr)", { - rush_plan(n_workers = 2) - skip_if_not_installed("ranger") - - - task = tsk("boston_housing") - learner = lrn("regr.auto", - learner_ids = "ranger", - small_data_size = 1, - resampling = rsmp("holdout"), - measure = msr("regr.mse"), - terminator = trm("evals", n_evals = 6) - ) - - expect_class(learner$train(task), "LearnerRegrAuto") - expect_equal(learner$model$instance$result$branch.selection, "ranger") -}) - -test_that("svm works (regr)", { - rush_plan(n_workers = 2) - skip_if_not_installed("e1071") - - - task = tsk("boston_housing") - learner = lrn("regr.auto", - learner_ids = "svm", - small_data_size = 1, - resampling = rsmp("holdout"), - measure = msr("regr.mse"), - terminator = trm("evals", n_evals = 6) - ) - - expect_class(learner$train(task), "LearnerRegrAuto") - expect_equal(learner$model$instance$result$branch.selection, "svm") -}) - -test_that("xgboost works (regr)", { - skip_if_not_installed("xgboost") - rush_plan(n_workers = 2) - - task = tsk("boston_housing") - learner = lrn("regr.auto", - learner_ids = "xgboost", - small_data_size = 1, - xgboost_eval_metric = "mlogloss", - resampling = rsmp("holdout"), - measure = msr("regr.mse"), - terminator = trm("evals", n_evals = 6) - ) - - expect_class(learner$train(task), "LearnerRegrAuto") - expect_equal(learner$model$instance$result$branch.selection, "xgboost") -}) - -test_that("catboost works 
(regr)", { - skip_if_not_installed("catboost") - rush_plan(n_workers = 2) - - task = tsk("boston_housing") - learner = lrn("regr.auto", - learner_ids = "catboost", - small_data_size = 1, - # catboost_eval_metric = "MultiClass", - resampling = rsmp("holdout"), - measure = msr("regr.mse"), - terminator = trm("evals", n_evals = 6) - ) - - expect_class(learner$train(task), "LearnerRegrAuto") - expect_equal(learner$model$instance$result$branch.selection, "catboost") -}) - -test_that("only extra_trees fails", { - rush_plan(n_workers = 2) - - task = tsk("boston_housing") - expect_error(lrn("regr.auto", - learner_ids = "extra_trees", - resampling = rsmp("holdout"), - measure = msr("regr.mse"), - terminator = trm("evals", n_evals = 6) - ), "must be combined with other learners") -}) - -test_that("extra_trees and glmnet works (regr)", { - skip_if_not_installed("glmnet") - rush_plan(n_workers = 2) - - task = tsk("boston_housing") - learner = lrn("regr.auto", - learner_ids = c("extra_trees", "glmnet"), - resampling = rsmp("holdout"), - measure = msr("regr.mse"), - terminator = trm("evals", n_evals = 6) - ) - - expect_class(learner$train(task), "LearnerRegrAuto") - expect_choice(learner$model$instance$result$branch.selection, c("extra_trees", "glmnet")) -}) - -test_that("lightgbm works (regr)", { - skip_if_not_installed("lightgbm") - rush_plan(n_workers = 2) - - task = tsk("boston_housing") - learner = lrn("regr.auto", - learner_ids = "lightgbm", - lightgbm_eval_metric = "multi_logloss", - resampling = rsmp("holdout"), - measure = msr("regr.mse"), - terminator = trm("evals", n_evals = 6) - ) - - expect_class(learner$train(task), "LearnerRegrAuto") - expect_equal(learner$model$instance$result$branch.selection, "lightgbm") -}) - -test_that("xgboost, catboost and lightgbm work (regr)", { - skip_if_not_installed(c("xgboost", "catboost", "lightgbm")) - rush_plan(n_workers = 2) - - task = tsk("boston_housing") - learner = lrn("regr.auto", - learner_ids = c("xgboost", "catboost", "lightgbm"), - # catboost_eval_metric = "MultiClass", - # lightgbm_eval_metric = "multi_logloss", - # xgboost_eval_metric = "mlogloss", - resampling = rsmp("holdout"), - lhs_size = 1, - measure = msr("regr.mse"), - terminator = trm("evals", n_evals = 20), - callbacks = clbk("mlr3tuning.async_save_logs") - ) - - expect_class(learner$train(task), "LearnerRegrAuto") -}) - -test_that("all learner work (regr)", { - skip_if_not_installed(c("glmnet", "kknn", "nnet", "ranger", "e1071", "xgboost", "catboost", "MASS", "lightgbm")) - rush_plan(n_workers = 2) - - task = tsk("boston_housing") - learner = lrn("regr.auto", - small_data_size = 100, - measure = msr("regr.mse"), - terminator = trm("evals", n_evals = 20), - lhs_size = 1 - ) - - expect_class(learner$train(task), "LearnerRegrAuto") - expect_class(learner$model$instance, "TuningInstanceAsyncSingleCrit") - expect_prediction(learner$predict(task)) -}) - -# test_that("memory limit works", { -# skip_if_not_installed(c("glmnet", "kknn", "nnet", "ranger", "e1071", "xgboost", "catboost", "MASS", "lightgbm")) -# rush_plan(n_workers = 2) - -# task = tsk("spam") -# learner = lrn("regr.auto", -# max_memory = 50, -# small_data_size = 100, -# measure = msr("regr.mse"), -# terminator = trm("evals", n_evals = 20), -# resampling = rsmp("holdout"), -# lhs_size = 1 -# ) - -# learner$train(task) -# }) - -test_that("small data set switch works (regr)", { - skip_if_not_installed(c("glmnet", "kknn", "nnet", "ranger", "e1071", "xgboost", "catboost", "lightgbm")) - rush_plan(n_workers = 2) - - task = 
tsk("boston_housing") - learner = lrn("regr.auto", - small_data_size = 1000, - small_data_resampling = rsmp("cv", folds = 2), - measure = msr("regr.mse"), - terminator = trm("evals", n_evals = 1), - lhs_size = 1, - store_benchmark_result = TRUE - ) - - expect_class(learner$train(task), "LearnerRegrAuto") - expect_equal(learner$model$instance$archive$benchmark_result$resamplings$resampling[[1]]$iters, 2) -}) - -test_that("large data set switch works (regr)", { - skip_if_not_installed(c("glmnet", "kknn", "nnet", "ranger", "e1071", "xgboost", "catboost", "lightgbm")) - rush_plan(n_workers = 2) - - task = tsk("boston_housing") - learner = lrn("regr.auto", - large_data_size = 100, - large_data_nthread = 4, - large_data_learner_ids = "ranger", - small_data_size = 100, - measure = msr("regr.mse"), - terminator = trm("evals", n_evals = 1), - lhs_size = 1, - store_benchmark_result = TRUE - ) - - expect_class(learner$train(task), "LearnerRegrAuto") - expect_set_equal(learner$model$instance$archive$data$branch.selection, "ranger") -}) - -test_that("max_cardinality works (regr)", { - skip_if_not_installed(c("glmnet", "kknn", "nnet", "ranger", "e1071", "xgboost", "catboost", "lightgbm")) - rush_plan(n_workers = 2) - - task = tsk("boston_housing") - learner = lrn("regr.auto", - small_data_size = 1, - resampling = rsmp("holdout"), - max_cardinality = 2, - measure = msr("regr.mse"), - terminator = trm("evals", n_evals = 1), - lhs_size = 1 - ) - - expect_class(learner$train(task), "LearnerRegrAuto") -}) - -test_that("max_cardinality works for extra trees (regr)", { - skip_if_not_installed(c("glmnet", "kknn", "nnet", "ranger", "e1071", "xgboost", "catboost", "lightgbm")) - rush_plan(n_workers = 2) - - task = tsk("boston_housing") - learner = lrn("regr.auto", - small_data_size = 1, - resampling = rsmp("holdout"), - max_cardinality = 3, - extra_trees_max_cardinality = 2, - measure = msr("regr.mse"), - terminator = trm("evals", n_evals = 1), - lhs_size = 1 - ) - - expect_class(learner$train(task), "LearnerRegrAuto") -}) From ae13f949d8259eae5f2ec91b7d3432d070ec1d52 Mon Sep 17 00:00:00 2001 From: b-zhou Date: Tue, 13 Aug 2024 14:34:20 +0200 Subject: [PATCH 22/34] chore: collate --- DESCRIPTION | 1 + 1 file changed, 1 insertion(+) diff --git a/DESCRIPTION b/DESCRIPTION index 938ca9c..ae3d58a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -62,5 +62,6 @@ Collate: 'LearnerClassifAutoSVM.R' 'LearnerClassifAutoXgboost.R' 'LearnerRegrAuto.R' + 'save_deepcave_run.R' 'helper.R' 'zzz.R' From f3bca1e0872d82a6b3ef9b8cd08b2e63261eda15 Mon Sep 17 00:00:00 2001 From: b-zhou Date: Tue, 13 Aug 2024 15:18:03 +0200 Subject: [PATCH 23/34] fix: global variables --- R/save_deepcave_run.R | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/R/save_deepcave_run.R b/R/save_deepcave_run.R index cbba48f..500dcca 100644 --- a/R/save_deepcave_run.R +++ b/R/save_deepcave_run.R @@ -96,7 +96,7 @@ save_deepcave_run = function(instance, path = "logs/mlr3automl", prefix = "run", # Prepare the list for converting to `configs.json` get_configs = function(instance){ - param_ids = instance$search_space$data[, id] + param_ids = instance$search_space$data$id logscale_params = param_ids[instance$search_space$is_logscale[param_ids]] config_table = instance$archive$data[, param_ids, with = FALSE] config_table[, (logscale_params) := lapply(.SD, exp), .SDcols = logscale_params] @@ -113,9 +113,10 @@ get_configs = function(instance){ # Prepare the list for converting to `configspace.json` get_configspace = 
function(instance) {
   n_params = nrow(instance$search_space$data)
-  param_ids = instance$search_space$data[, id]
+  param_ids = instance$search_space$data$id
 
   hyperparameters_list = map(param_ids, function(param_id) {
+    id = NULL # resolve global variable note in R CMD check
     row = instance$search_space$data[id == param_id, ]
 
@@ -160,11 +161,13 @@ get_configspace = function(instance) {
   })
 
   conditions_list = map(setdiff(param_ids, "branch.selection"), function(param_id) {
+    id = NULL # resolve global variable note in R CMD check
     dependency = instance$search_space$deps[id == param_id, ]
     # `svm.degree` and `svm.gamma` depend on `svm.kernel` as well as `branch.selection`.
     # DeepCAVE does not allow one parameter to be conditioned on multiple others.
     # So remove their dependency on `branch.selection`.
     if (nrow(dependency) > 1) {
+      on = NULL # resolve global variable note in R CMD check
       dependency = dependency[on != "branch.selection", ]
     }
     child = param_id
@@ -174,7 +177,7 @@ get_configspace = function(instance) {
     # Currently, there are only 'CondEqual' and 'CondAnyOf', which should not be used simultaneously.
     # So this list should always contain only one entry.
     cond = dependency[["cond"]][[1]]
-    if (is(cond, "CondEqual")) {
+    if (class(cond)[[1]] == "CondEqual") {
       return(list(child = child, parent = parent, type = "EQ", value = cond$rhs))
     }
     return(list(child = child, parent = parent, type = "IN", values = cond$rhs))
@@ -189,10 +192,11 @@ get_configspace = function(instance) {
 
 # Prepare the data.table for converting to `history.jsonl`
 get_history = function(instance) {
-  costs = c(instance$objective$codomain$data[, id], "runtime_learners")
+  costs = c(instance$objective$codomain$data$id, "runtime_learners")
   selected_cols = c(costs, "timestamp_xs", "timestamp_ys", "state")
 
-  history_table = instance$archive$data[, ..selected_cols][, .(
+  timestamp_xs = timestamp_ys = state = NULL # resolve global variable note in R CMD check
+  history_table = instance$archive$data[, selected_cols, with = FALSE][, list(
     config_id = seq_len(nrow(instance$archive$data)) - 1,
     budget = 0,
     seed = -1,
@@ -216,7 +220,7 @@ get_history = function(instance) {
 
 # Prepare the list for converting to 'meta.json'
 get_meta = function(instance){
   # time is handled separately below
-  costs = instance$objective$codomain$data[, id]
+  costs = instance$objective$codomain$data$id
 
   objectives_list = map(costs, function(cost) {
     measure = msr(cost)
@@ -225,7 +229,7 @@ get_meta = function(instance){
     if (is.finite(lower)) {
       lock_lower = TRUE
     } else {
-      lower = min(instance$archive$data[, ..cost])
+      lower = min(instance$archive$data[, cost, with = FALSE])
       lock_lower = FALSE
     }
 
@@ -233,7 +237,7 @@ get_meta = function(instance){
     if (is.finite(upper)) {
       lock_upper = TRUE
     } else {
-      upper = max(instance$archive$data[, ..cost])
+      upper = max(instance$archive$data[, cost, with = FALSE])
       lock_upper = FALSE
     }
 
@@ -250,7 +254,7 @@ get_meta = function(instance){
   objectives_list = c(objectives_list, list(list(
     name = "time",
     lower = 0,
-    upper = max(instance$archive$data[, runtime_learners]),
+    upper = max(instance$archive$data[, "runtime_learners", with = FALSE]),
     lock_lower = TRUE,
     lock_upper = FALSE,
     optimize = "lower"

From c306d7c3465884dcc261d82b374200c87b1952f3 Mon Sep 17 00:00:00 2001
From: b-zhou
Date: Tue, 13 Aug 2024 15:22:24 +0200
Subject: [PATCH 24/34] test: no overwrite

---
 tests/testthat/test_save_deepcave_run.R | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git 
a/tests/testthat/test_save_deepcave_run.R b/tests/testthat/test_save_deepcave_run.R index d24cd47..ffc0717 100644 --- a/tests/testthat/test_save_deepcave_run.R +++ b/tests/testthat/test_save_deepcave_run.R @@ -1,4 +1,4 @@ -test_that("run is saved", { +test_that("run is saved without overwriting", { rush_plan(n_workers = 2) skip_if_not_installed("e1071") @@ -13,7 +13,13 @@ test_that("run is saved", { learner$train(task) dir = tempdir() - expected_path = file.path(dir, "test-1-run_1") + prefix = "test-1-run" + previous_run = file.path(dir, prefix, "run_1") + if (!file.exists(previous_run)) { + dir.create(previous_run) + } + + expected_path = file.path(dir, "test-1-run_2") if (file.exists(expected_path)) { lapply(list.files(expected_path, full.names = TRUE), file.remove) file.remove(expected_path) From 3bb2d9c80dccba01eaf605d90d4575cd79fac7a6 Mon Sep 17 00:00:00 2001 From: b-zhou Date: Tue, 13 Aug 2024 15:39:19 +0200 Subject: [PATCH 25/34] fix: save without overwriting --- R/save_deepcave_run.R | 16 +++++++++++----- tests/testthat/test_save_deepcave_run.R | 15 ++++++++------- 2 files changed, 19 insertions(+), 12 deletions(-) diff --git a/R/save_deepcave_run.R b/R/save_deepcave_run.R index 500dcca..ee06c4c 100644 --- a/R/save_deepcave_run.R +++ b/R/save_deepcave_run.R @@ -28,18 +28,24 @@ save_deepcave_run = function(instance, path = "logs/mlr3automl", prefix = "run", # (https://github.com/automl/DeepCAVE/blob/main/deepcave/runs/recorder.py) if (!overwrite) { new_idx = 0 - walk(list.files(path), function(fn) { - if (!startsWith(fn, "prefix")) return() - idx = last(strsplit(fn, "_")[[1]]) - if (is.numeric(idx)) { + for (fn in list.files(path)) { + if (!startsWith(fn, prefix)) next + + splitted = strsplit(fn, "_")[[1]] + if (length(splitted) == 1) next # no run index attached + + idx = suppressWarnings(last(splitted)) + if (!is.na(idx)) { # idx is successfully coerced to a number idx_int = as.integer(idx) if (idx_int > new_idx) { new_idx = idx_int } } - }) + } + new_idx = new_idx + 1 run_path = file.path(path, paste0(prefix, "_", new_idx)) + dir.create(run_path) } else { run_path = file.path(path, prefix) diff --git a/tests/testthat/test_save_deepcave_run.R b/tests/testthat/test_save_deepcave_run.R index ffc0717..0460c2d 100644 --- a/tests/testthat/test_save_deepcave_run.R +++ b/tests/testthat/test_save_deepcave_run.R @@ -1,4 +1,12 @@ test_that("run is saved without overwriting", { + dir = tempdir() + prefix = "test-1-run" + dir.create(file.path(dir, prefix)) + previous_run = file.path(dir, prefix, "run_1") + if (!file.exists(previous_run)) { + dir.create(previous_run) + } + rush_plan(n_workers = 2) skip_if_not_installed("e1071") @@ -11,13 +19,6 @@ test_that("run is saved without overwriting", { terminator = trm("evals", n_evals = 6) ) learner$train(task) - - dir = tempdir() - prefix = "test-1-run" - previous_run = file.path(dir, prefix, "run_1") - if (!file.exists(previous_run)) { - dir.create(previous_run) - } expected_path = file.path(dir, "test-1-run_2") if (file.exists(expected_path)) { From 05f6587bf3e3b70d56695c3690f2d606a20989a1 Mon Sep 17 00:00:00 2001 From: b-zhou Date: Tue, 13 Aug 2024 15:41:43 +0200 Subject: [PATCH 26/34] build: update --- DESCRIPTION | 2 +- NAMESPACE | 1 + man/save_deepcave_run.Rd | 29 +++++++++++++++++++++++++++++ 3 files changed, 31 insertions(+), 1 deletion(-) create mode 100644 man/save_deepcave_run.Rd diff --git a/DESCRIPTION b/DESCRIPTION index ae3d58a..b3e3839 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -62,6 +62,6 @@ Collate: 'LearnerClassifAutoSVM.R' 
'LearnerClassifAutoXgboost.R'
     'LearnerRegrAuto.R'
-    'save_deepcave_run.R'
     'helper.R'
+    'save_deepcave_run.R'
     'zzz.R'

diff --git a/NAMESPACE b/NAMESPACE
index 6fc0f11..14bb580 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -4,6 +4,7 @@ export(LearnerClassifAuto)
 export(LearnerClassifAutoSVM)
 export(LearnerClassifAutoXgboost)
 export(LearnerRegrAuto)
+export(save_deepcave_run)
 import(R6)
 import(checkmate)
 import(data.table)

diff --git a/man/save_deepcave_run.Rd b/man/save_deepcave_run.Rd
new file mode 100644
index 0000000..ab37cc8
--- /dev/null
+++ b/man/save_deepcave_run.Rd
@@ -0,0 +1,29 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/save_deepcave_run.R
+\name{save_deepcave_run}
+\alias{save_deepcave_run}
+\title{Save Tuning History as a DeepCAVE Run}
+\usage{
+save_deepcave_run(
+  instance,
+  path = "logs/mlr3automl",
+  prefix = "run",
+  overwrite = FALSE
+)
+}
+\arguments{
+\item{instance}{(\link{TuningInstanceAsyncSingleCrit})
+Tuning instance to save.}
+
+\item{path}{(\code{character(1)})
+Path to save the run. Defaults to \verb{"logs/mlr3automl"}.}
+
+\item{prefix}{(\code{character(1)})
+Prefix for the name of a new subfolder under \code{path} for storing the current run.}
+
+\item{overwrite}{(\code{logical(1)})
+If \code{FALSE} (default), creates a new subfolder to save the current run. If \code{TRUE}, all existing runs will be deleted.}
+}
+\description{
+Exports information stored in a \code{TuningInstance} in a format recognized by \href{https://automl.github.io/DeepCAVE/main/index.html}{DeepCAVE} as a run. Each run is stored as a folder containing five files \code{configs.json}, \code{configspace.json}, \code{history.jsonl}, \code{meta.json}, and \code{origins.json}.
+}

From 747b7050052656f00f5d04ad6bf0ea5c0d4d63d4 Mon Sep 17 00:00:00 2001
From: b-zhou
Date: Tue, 13 Aug 2024 16:26:44 +0200
Subject: [PATCH 27/34] fix: save path

---
 R/save_deepcave_run.R                   | 4 ++--
 tests/testthat/test_save_deepcave_run.R | 9 +++------
 2 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/R/save_deepcave_run.R b/R/save_deepcave_run.R
index ee06c4c..f62dc76 100644
--- a/R/save_deepcave_run.R
+++ b/R/save_deepcave_run.R
@@ -60,7 +60,7 @@ save_deepcave_run = function(instance, path = "logs/mlr3automl", prefix = "run",
   # `configspace.json`
   jsonlite::write_json(
     get_configspace(instance),
-    paste0(run_path, "/configspace.json"),
+    file.path(run_path, "configspace.json"),
     auto_unbox = TRUE, pretty = TRUE, null = "null"
   )
 
@@ -94,7 +94,7 @@ save_deepcave_run = function(instance, path = "logs/mlr3automl", prefix = "run",
   names(origins) = seq(nrow(instance$archive$data)) - 1
   jsonlite::write_json(
     origins,
-    paste0(run_path, "/origins.json"),
+    file.path(run_path, "origins.json"),
     pretty = TRUE, null = "null"
   )
 }

diff --git a/tests/testthat/test_save_deepcave_run.R b/tests/testthat/test_save_deepcave_run.R
index 0460c2d..4a3f85b 100644
--- a/tests/testthat/test_save_deepcave_run.R
+++ b/tests/testthat/test_save_deepcave_run.R
@@ -1,11 +1,8 @@
 test_that("run is saved without overwriting", {
   dir = tempdir()
-  prefix = "test-1-run"
-  dir.create(file.path(dir, prefix))
-  previous_run = file.path(dir, prefix, "run_1")
-  if (!file.exists(previous_run)) {
-    dir.create(previous_run)
-  }
+
+  previous_run = file.path(dir, "test-1-run_1")
+  dir.create(previous_run, showWarnings = FALSE)
 
   rush_plan(n_workers = 2)
   skip_if_not_installed("e1071")

From 9e0739cb712a29e4146bb92eef878f85e97e8b38 Mon Sep 17 00:00:00 2001
From: b-zhou
Date: Fri, 16 Aug 2024 13:05:28 +0200
Subject: [PATCH 28/34]
fix: typo --- R/save_deepcave_run.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/save_deepcave_run.R b/R/save_deepcave_run.R index f62dc76..8b3b6c1 100644 --- a/R/save_deepcave_run.R +++ b/R/save_deepcave_run.R @@ -140,7 +140,7 @@ get_configspace = function(instance) { choices = choices, # FIXME: `default` is wrong default = choices[[1]], - probabilisties = NULL + probabilities = NULL )) } From ea05c84b12af2d622fdf5add97fd515ab7b4ba9f Mon Sep 17 00:00:00 2001 From: b-zhou Date: Fri, 16 Aug 2024 14:29:51 +0200 Subject: [PATCH 29/34] fix: default --- R/save_deepcave_run.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/save_deepcave_run.R b/R/save_deepcave_run.R index 8b3b6c1..9583987 100644 --- a/R/save_deepcave_run.R +++ b/R/save_deepcave_run.R @@ -149,7 +149,7 @@ get_configspace = function(instance) { lower = row[["lower"]] upper = row[["upper"]] # FIXME: default is wrong - default = mean(lower, upper) + default = lower if (is_logscale) { lower = exp(lower) upper = exp(upper) From f44b4c990a852fb3a59a1f73e7d211261b84729b Mon Sep 17 00:00:00 2001 From: b-zhou Date: Sun, 18 Aug 2024 22:46:18 +0200 Subject: [PATCH 30/34] fix: remove time objective --- R/save_deepcave_run.R | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/R/save_deepcave_run.R b/R/save_deepcave_run.R index 9583987..7e396b0 100644 --- a/R/save_deepcave_run.R +++ b/R/save_deepcave_run.R @@ -83,7 +83,7 @@ save_deepcave_run = function(instance, path = "logs/mlr3automl", prefix = "run", jsonlite::stream_out( get_history(instance), con, - auto_unbox = TRUE, pretty = TRUE, null = "list", na = "null", + auto_unbox = FALSE, pretty = TRUE, null = "list", na = "null", dataframe = "values", verbose = FALSE ) @@ -198,7 +198,7 @@ get_configspace = function(instance) { # Prepare the data.table for converting to `history.jsonl` get_history = function(instance) { - costs = c(instance$objective$codomain$data$id, "runtime_learners") + costs = instance$objective$codomain$data$id selected_cols = c(costs, "timestamp_xs", "timestamp_ys", "state") timestamp_xs = timestamp_ys = state = NULL # resolve global variable note in R CDM check @@ -225,7 +225,6 @@ get_history = function(instance) { # Prepare the list for converting to 'meta.json' get_meta = function(instance){ - # time is handled separately below costs = instance$objective$codomain$data$id objectives_list = map(costs, function(cost) { @@ -256,15 +255,6 @@ get_meta = function(instance){ return(list(name = cost, lower = lower, upper = upper, lock_lower = lock_lower, lock_upper = lock_upper, optimize = optimize)) }) - - objectives_list = c(objectives_list, list(list( - name = "time", - lower = 0, - upper = max(instance$archive$data[, "runtime_learners", with = FALSE]), - lock_lower = TRUE, - lock_upper = FALSE, - optimize = "lower" - ))) return(list( objectives = objectives_list, From 0c3005aaf9d40108a7cd2b84403225641c146fe9 Mon Sep 17 00:00:00 2001 From: b-zhou Date: Sun, 18 Aug 2024 23:13:20 +0200 Subject: [PATCH 31/34] feat: skip branch.selection if only one branch --- R/save_deepcave_run.R | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/R/save_deepcave_run.R b/R/save_deepcave_run.R index 7e396b0..8e8809e 100644 --- a/R/save_deepcave_run.R +++ b/R/save_deepcave_run.R @@ -103,8 +103,14 @@ save_deepcave_run = function(instance, path = "logs/mlr3automl", prefix = "run", # Prepare the list for converting to `configs.json` get_configs = function(instance){ param_ids = 
instance$search_space$data$id
-  logscale_params = param_ids[instance$search_space$is_logscale[param_ids]]
+
+  # skip branch.selection if there is only one level
+  nbranches = instance$search_space$data[id == "branch.selection", "nlevels", with = FALSE]
+  if (nbranches == 1) {
+    param_ids = setdiff(param_ids, "branch.selection")
+  }
+
   config_table = instance$archive$data[, param_ids, with = FALSE]
+  logscale_params = param_ids[instance$search_space$is_logscale[param_ids]]
   config_table[, (logscale_params) := lapply(.SD, exp), .SDcols = logscale_params]
 
   configs_list = map(seq_len(nrow(config_table)), function(i) {
@@ -125,6 +131,9 @@ get_configspace = function(instance) {
     id = NULL # resolve global variable note in R CMD check
     row = instance$search_space$data[id == param_id, ]
 
+    # skip branch.selection if there is only one branch
+    if (param_id == "branch.selection" && row[["nlevels"]] == 1) return()
+
     type = switch(row[["class"]],
       ParamFct = "categorical",
@@ -165,6 +174,8 @@ get_configspace = function(instance) {
       q = NULL
     ))
   })
+  hyperparameters_list = discard(hyperparameters_list, is.null)
+
 
   conditions_list = map(setdiff(param_ids, "branch.selection"), function(param_id) {
     id = NULL # resolve global variable note in R CMD check
@@ -178,6 +189,10 @@ get_configspace = function(instance) {
     }
     child = param_id
     parent = dependency[["on"]]
+
+    # remove dependency on branch.selection if there is only one branch
+    nbranches = instance$search_space$data[id == "branch.selection", "nlevels", with = FALSE]
+    if (parent == "branch.selection" && nbranches == 1) return()
 
     # `cond` below is a list of `Condition`s.
     # Currently, there are only 'CondEqual' and 'CondAnyOf', which should not be used simultaneously.
@@ -203,6 +208,7 @@ get_configspace = function(instance) {
     return(list(child = child, parent = parent, type = "IN", values = cond$rhs))
   })
+  conditions_list = discard(conditions_list, is.null)
 
   return(list(
     hyperparameters = hyperparameters_list,

From 8b2648bbdb179d039780f7c13b961f433b76b660 Mon Sep 17 00:00:00 2001
From: b-zhou
Date: Sun, 18 Aug 2024 23:25:56 +0200
Subject: [PATCH 32/34] refactor: readability

---
 R/save_deepcave_run.R | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/R/save_deepcave_run.R b/R/save_deepcave_run.R
index 8e8809e..3c4d280 100644
--- a/R/save_deepcave_run.R
+++ b/R/save_deepcave_run.R
@@ -35,11 +35,11 @@ save_deepcave_run = function(instance, path = "logs/mlr3automl", prefix = "run",
       if (length(splitted) == 1) next # no run index attached
 
-      idx = suppressWarnings(last(splitted))
-      if (!is.na(idx)) { # idx is successfully coerced to a number
-        idx_int = as.integer(idx)
-        if (idx_int > new_idx) {
-          new_idx = idx_int
-        }
+      idx = suppressWarnings(as.integer(last(splitted)))
+      if (is.na(idx)) next # idx cannot be coerced to a number
+
+      idx_int = as.integer(idx)
+      if (idx_int > new_idx) {
+        new_idx = idx_int
       }
@@ -83,7 +83,9 @@ save_deepcave_run = function(instance, path = "logs/mlr3automl", prefix = "run",
   jsonlite::stream_out(
     get_history(instance),
     con,
-    auto_unbox = FALSE, pretty = TRUE, null = "list", na = "null",
+    # objectives must be a list, so do not auto unbox if a list has only one entry
+    auto_unbox = FALSE,
+    pretty = TRUE, null = "list", na = "null",
     dataframe = "values", verbose = FALSE
   )
 
@@ -103,19 +105,22 @@ save_deepcave_run = function(instance, path = "logs/mlr3automl", prefix = "run",
 
 # Prepare the list for converting to `configs.json`
 get_configs = function(instance){
   param_ids = instance$search_space$data$id
+
   # skip branch.selection if there is only one level
From 8b2648bbdb179d039780f7c13b961f433b76b660 Mon Sep 17 00:00:00 2001
From: b-zhou
Date: Sun, 18 Aug 2024 23:25:56 +0200
Subject: [PATCH 32/34] refactor: readability

---
 R/save_deepcave_run.R | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/R/save_deepcave_run.R b/R/save_deepcave_run.R
index 8e8809e..3c4d280 100644
--- a/R/save_deepcave_run.R
+++ b/R/save_deepcave_run.R
@@ -35,11 +35,11 @@ save_deepcave_run = function(instance, path = "logs/mlr3automl", prefix = "run",
     if (length(splitted) == 1) next # no run index attached
 
     idx = suppressWarnings(last(splitted))
-    if (!is.na(idx)) { # idx is successfully coerced to a number
-      idx_int = as.integer(idx)
-      if (idx_int > new_idx) {
-        new_idx = idx_int
-      }
+    if (is.na(idx)) next # idx cannot be coerced to a number
+
+    idx_int = as.integer(idx)
+    if (idx_int > new_idx) {
+      new_idx = idx_int
     }
   }
 
@@ -83,7 +83,9 @@ save_deepcave_run = function(instance, path = "logs/mlr3automl", prefix = "run",
   jsonlite::stream_out(
     get_history(instance),
     con,
-    auto_unbox = FALSE, pretty = TRUE, null = "list", na = "null",
+    # objectives must stay a JSON array, so do not auto-unbox length-one entries
+    auto_unbox = FALSE,
+    pretty = TRUE, null = "list", na = "null",
     dataframe = "values",
     verbose = FALSE
   )
@@ -103,19 +105,22 @@ save_deepcave_run = function(instance, path = "logs/mlr3automl", prefix = "run",
 
 # Prepare the list for converting to `configs.json`
 get_configs = function(instance){
   param_ids = instance$search_space$data$id
+  # skip branch.selection if there is only one level
+  id = NULL # resolve global variable note in R CMD check
   nbranches = instance$search_space$data[id == "branch.selection", "nlevels", with = FALSE]
   if (nbranches == 1) {
     param_ids = setdiff(param_ids, "branch.selection")
   }
   config_table = instance$archive$data[, param_ids, with = FALSE]
+  # param values in deepcave are on the original scale, not the log scale
   logscale_params = param_ids[instance$search_space$is_logscale[param_ids]]
   config_table[, (logscale_params) := lapply(.SD, exp), .SDcols = logscale_params]
 
   configs_list = map(seq_len(nrow(config_table)), function(i) {
     discard(as.list(config_table[i, ]), is.na)
-  }) 
+  })
 
   names(configs_list) = seq_along(configs_list) - 1
 
   return(configs_list)
@@ -124,7 +129,6 @@ get_configs = function(instance){
 
 # Prepare the list for converting to `configspace.json`
 get_configspace = function(instance) {
-  n_params = nrow(instance$search_space$data)
   param_ids = instance$search_space$data$id
 
   hyperparameters_list = map(param_ids, function(param_id) {
@@ -174,6 +178,7 @@ get_configspace = function(instance) {
       q = NULL
     ))
   })
+  # skipping branch.selection results in null entries => discard them
   hyperparameters_list = discard(hyperparameters_list, is.null)
 
@@ -203,6 +208,7 @@ get_configspace = function(instance) {
     }
     return(list(child = child, parent = parent, type = "IN", values = cond$rhs))
   })
+  # skipping branch.selection results in null entries => discard them
   conditions_list = discard(conditions_list, is.null)
 
   return(list(
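
A note on the `auto_unbox` comment in the hunk above: with `auto_unbox = TRUE`, jsonlite collapses length-one vectors into JSON scalars, so a run with a single objective would be written as a plain string where DeepCAVE expects an array. A quick illustration (the `objectives` key is used here purely for illustration):

    library(jsonlite)

    # a length-one vector collapses to a scalar => wrong shape
    toJSON(list(objectives = "regr.mse"), auto_unbox = TRUE)
    #> {"objectives":"regr.mse"}

    # auto_unbox = FALSE keeps the JSON array
    toJSON(list(objectives = "regr.mse"), auto_unbox = FALSE)
    #> {"objectives":["regr.mse"]}
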
From 8d46679347dbda79eea314eae089b9c39857560b Mon Sep 17 00:00:00 2001
From: b-zhou
Date: Mon, 19 Aug 2024 00:53:48 +0200
Subject: [PATCH 33/34] docs: save run

---
 R/save_deepcave_run.R    | 31 +++++++++++++++++++++++++++++++
 man/save_deepcave_run.Rd | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 63 insertions(+)

diff --git a/R/save_deepcave_run.R b/R/save_deepcave_run.R
index 3c4d280..9739d90 100644
--- a/R/save_deepcave_run.R
+++ b/R/save_deepcave_run.R
@@ -16,6 +16,37 @@
 #' If `FALSE` (default), creates a new subfolder to save the current run. If `TRUE`, all existing runs will be deleted.
 #'
 #' @export
+#' @examples
+#' \dontrun{
+#' rush_plan(n_workers = 2)
+#' task = tsk("penguins")
+#'
+#' learner1 = lrn("classif.auto",
+#'   learner_ids = c("svm", "ranger"),
+#'   small_data_size = 1,
+#'   resampling = rsmp("holdout"),
+#'   measure = msr("classif.ce"),
+#'   terminator = trm("evals", n_evals = 6)
+#' )
+#' learner1$train(task)
+#' # save to `logs/mlr3automl/run_1`
+#' save_deepcave_run(learner1$instance)
+#'
+#' # save to `logs/mlr3automl/run`
+#' # if this folder already exists, it will be overwritten
+#' save_deepcave_run(learner1$instance, overwrite = TRUE)
+#'
+#' learner2 = lrn("classif.auto",
+#'   learner_ids = c("catboost", "xgboost"),
+#'   small_data_size = 1,
+#'   resampling = rsmp("holdout"),
+#'   measure = msr("classif.ce"),
+#'   terminator = trm("evals", n_evals = 6)
+#' )
+#' learner2$train(task)
+#' # save to `logs/mlr3automl/run_2`
+#' save_deepcave_run(learner2$instance)
+#' }
 save_deepcave_run = function(instance, path = "logs/mlr3automl", prefix = "run", overwrite = FALSE) {
   # don't save untuned instance
   if (is.null(instance$result_learner_param_vals)) {
diff --git a/man/save_deepcave_run.Rd b/man/save_deepcave_run.Rd
index ab37cc8..61ca04d 100644
--- a/man/save_deepcave_run.Rd
+++ b/man/save_deepcave_run.Rd
@@ -27,3 +27,35 @@ If \code{FALSE} (default), creates a new subfolder to save the current run. If \
 \description{
 Exports information stored in a \code{TuningInstance} in a format recognized by \href{https://automl.github.io/DeepCAVE/main/index.html}{DeepCAVE} as a run. Each run is stored as a folder containing five files \code{configs.json}, \code{configspace.json}, \code{history.jsonl}, \code{meta.json}, and \code{origins.json}.
 }
+\examples{
+\dontrun{
+rush_plan(n_workers = 2)
+task = tsk("penguins")
+
+learner1 = lrn("classif.auto",
+  learner_ids = c("svm", "ranger"),
+  small_data_size = 1,
+  resampling = rsmp("holdout"),
+  measure = msr("classif.ce"),
+  terminator = trm("evals", n_evals = 6)
+)
+learner1$train(task)
+# save to `logs/mlr3automl/run_1`
+save_deepcave_run(learner1$instance)
+
+# save to `logs/mlr3automl/run`
+# if this folder already exists, it will be overwritten
+save_deepcave_run(learner1$instance, overwrite = TRUE)
+
+learner2 = lrn("classif.auto",
+  learner_ids = c("catboost", "xgboost"),
+  small_data_size = 1,
+  resampling = rsmp("holdout"),
+  measure = msr("classif.ce"),
+  terminator = trm("evals", n_evals = 6)
+)
+learner2$train(task)
+# save to `logs/mlr3automl/run_2`
+save_deepcave_run(learner2$instance)
+}
+}

From cf7279a0ee2957cdd1f652028b6491678293681c Mon Sep 17 00:00:00 2001
From: b-zhou
Date: Mon, 19 Aug 2024 00:54:11 +0200
Subject: [PATCH 34/34] fix: create dir

---
 R/save_deepcave_run.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/save_deepcave_run.R b/R/save_deepcave_run.R
index 9739d90..c71745e 100644
--- a/R/save_deepcave_run.R
+++ b/R/save_deepcave_run.R
@@ -77,13 +77,13 @@ save_deepcave_run = function(instance, path = "logs/mlr3automl", prefix = "run",
 
     new_idx = new_idx + 1
     run_path = file.path(path, paste0(prefix, "_", new_idx))
-    dir.create(run_path)
+    dir.create(run_path, recursive = TRUE)
   } else {
     run_path = file.path(path, prefix)
     if (file.exists(run_path)) {
       lapply(list.files(run_path, full.names = TRUE), file.remove)
     } else {
-      dir.create(run_path)
+      dir.create(run_path, recursive = TRUE)
     }
   }
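
For reference on the `recursive = TRUE` fix: base R's `dir.create()` does not create missing parent directories by default, so saving the first run under a fresh `logs/mlr3automl` tree would fail before this patch. A small sketch of the failure mode, assuming no `logs/` directory exists yet:

    dir.create("logs/mlr3automl/run_1")
    #> Warning: cannot create dir 'logs/mlr3automl/run_1', reason 'No such file or directory'

    # recursive = TRUE creates the parents as needed, like `mkdir -p`
    dir.create("logs/mlr3automl/run_1", recursive = TRUE)
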