refactor: add tuning changes (#59)

* refactor: rewrite auto fselector function and class * refactor: objective * fix: task clone * fix: parameters * refactor: objective * docs: rebuild * fix: as.data.table * docs: update * docs: update examples * docs: update
mlr-org · Nov 15, 2022 · 55f1735 · 55f1735
1 parent c851ef0
commit 55f1735
Show file tree

Hide file tree

Showing 52 changed files with 1,306 additions and 892 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -54,30 +54,30 @@ Language: en-US
 NeedsCompilation: no
 Roxygen: list(markdown = TRUE)
 RoxygenNote: 7.2.1
-Collate:
-    'assertions.R'
-    'AutoFSelector.R'
+Collate: 
     'ArchiveFSelect.R'
-    'ObjectiveFSelect.R'
-    'helper.R'
+    'AutoFSelector.R'
+    'FSelectInstanceSingleCrit.R'
+    'FSelectInstanceMultiCrit.R'
     'mlr_fselectors.R'
-    'auto_fselector.R'
-    'extract_inner_fselect_archives.R'
-    'extract_inner_fselect_results.R'
-    'fselect.R'
-    'fselect_nested.R'
     'FSelector.R'
-    'FSelectorFromOptimizer.R'
+    'FSelectorDesignPoints.R'
     'FSelectorExhaustiveSearch.R'
+    'FSelectorFromOptimizer.R'
+    'FSelectorGeneticSearch.R'
     'FSelectorRFE.R'
     'FSelectorRandomSearch.R'
     'FSelectorSequential.R'
     'FSelectorShadowVariableSearch.R'
-    'FSelectorDesignPoints.R'
-    'FSelectorGeneticSearch.R'
-    'FSelectInstanceMultiCrit.R'
-    'FSelectInstanceSingleCrit.R'
+    'ObjectiveFSelect.R'
+    'assertions.R'
+    'auto_fselector.R'
+    'bibentries.R'
+    'extract_inner_fselect_archives.R'
+    'extract_inner_fselect_results.R'
+    'fselect.R'
+    'fselect_nested.R'
+    'helper.R'
     'reexports.R'
     'sugar.R'
-    'bibentries.R'
     'zzz.R'
diff --git a/NAMESPACE b/NAMESPACE
@@ -26,6 +26,7 @@ export(extract_inner_fselect_results)
 export(fs)
 export(fselect)
 export(fselect_nested)
+export(fsi)
 export(fss)
 export(mlr_fselectors)
 export(mlr_terminators)

diff --git a/NEWS.md b/NEWS.md
@@ -4,6 +4,8 @@
 * refactor: The `AutoFSelector` stores the instance and benchmark result if `store_models = TRUE`.
 * refactor: The `AutoFSelector` stores the instance if `store_benchmark_result = TRUE`.
 * feat: Add missing parameters from `AutoFSelector` to `auto_fselect()`.
+* feat: Add `fsi()` function to create a `FSelectInstanceSingleCrit` or `FSelectInstanceMultiCrit`.
+* refactor: Remove `unnest` option from `as.data.table.ArchiveFSelect()` function.
 
 # mlr3fselect 0.7.2
 

diff --git a/R/ArchiveFSelect.R b/R/ArchiveFSelect.R
@@ -1,8 +1,16 @@
-#' @title Logging Object for Evaluated Feature Sets
+#' @title Class for Logging Evaluated Feature Sets
 #'
 #' @description
-#' Container around a [data.table::data.table()] which stores all evaluated
-#' feature sets and performance scores.
+#' The [ArchiveFSelect] stores all evaluated feature sets and performance scores.
+#'
+#' @details
+#' The [ArchiveFSelect] is a container around a [data.table::data.table()].
+#' Each row corresponds to a single evaluation of a feature set.
+#' See the section on Data Structure for more information.
+#' The archive additionally stores a [mlr3::BenchmarkResult] (`$benchmark_result`) that records the resampling experiments.
+#' Each experiment corresponds to a single evaluation of a feature set.
+#' The table (`$data`) and the benchmark result (`$benchmark_result`) are linked by the `uhash` column.
+#' If the archive is passed to `as.data.table()`, both are joined automatically.
 #'
 #' @section Data structure:
 #'
@@ -11,52 +19,33 @@
 #' * One column for each feature of the task (`$search_space`).
 #' * One column for each performance measure (`$codomain`).
 #' * `runtime_learners` (`numeric(1)`)\cr
-#'   Sum of training and predict times logged in learners per
-#'   [mlr3::ResampleResult] / evaluation. This does not include potential
-#'   overhead time.
+#'   Sum of training and predict times logged in learners per [mlr3::ResampleResult] / evaluation.
+#'   This does not include potential overhead time.
 #' * `timestamp` (`POSIXct`)\cr
 #'   Time stamp when the evaluation was logged into the archive.
 #' * `batch_nr` (`integer(1)`)\cr
-#'   Feature sets are evaluated in batches. Each batch has a unique batch
-#'   number.
+#'   Feature sets are evaluated in batches. Each batch has a unique batch number.
 #' * `uhash` (`character(1)`)\cr
-#'   Connects each feature set to the resampling experiment
-#'   stored in the [mlr3::BenchmarkResult].
-#'
-#' Each row corresponds to a single evaluation of a feature set.
-#'
-#' The archive stores additionally a [mlr3::BenchmarkResult]
-#' (`$benchmark_result`) that records the resampling experiments. Each
-#' experiment corresponds to to a single evaluation of a feature set. The table
-#' (`$data`) and the benchmark result (`$benchmark_result`) are linked by the
-#' `uhash` column. If the results are viewed with `as.data.table()`, both are
-#' joined automatically.
+#'   Connects each feature set to the resampling experiment stored in the [mlr3::BenchmarkResult].
 #'
 #' @section Analysis:
-#'
-#' For analyzing the feature selection results, it is recommended to pass the archive to
-#' `as.data.table()`. The returned data table is joined with the benchmark result
-#' which adds the [mlr3::ResampleResult] for each feature set.
+#' For analyzing the feature selection results, it is recommended to pass the archive to `as.data.table()`.
+#' The returned data table is joined with the benchmark result which adds the [mlr3::ResampleResult] for each feature set.
 #'
 #' The archive provides various getters (e.g. `$learners()`) to ease the access.
-#' All getters extract by position (`i`) or unique hash (`uhash`). For a
-#' complete list of all getters see the methods section.
+#' All getters extract by position (`i`) or unique hash (`uhash`).
+#' For a complete list of all getters see the methods section.
 #'
-#' The benchmark result (`$benchmark_result`) allows to score the feature sets
-#' again on a different measure. Alternatively, measures can be supplied to
-#' `as.data.table()`.
+#' The benchmark result (`$benchmark_result`) allows scoring the feature sets again on a different measure.
+#' Alternatively, measures can be supplied to `as.data.table()`.
 #'
 #' @section S3 Methods:
-#' * `as.data.table.ArchiveFSelect(x, unnest = NULL, exclude_columns = "uhash", measures = NULL)`\cr
+#' * `as.data.table.ArchiveFSelect(x, exclude_columns = "uhash", measures = NULL)`\cr
 #' Returns a tabular view of all evaluated feature sets.\cr
 #' [ArchiveFSelect] -> [data.table::data.table()]\cr
 #'     * `x` ([ArchiveFSelect])
-#'     * `unnest` (`character()`)\cr
-#'       Transforms list columns to separate columns. Set to `NULL` if no column
-#'       should be unnested.
 #'     * `exclude_columns` (`character()`)\cr
-#'       Exclude columns from table. Set to `NULL` if no column should be
-#'       excluded.
+#'       Exclude columns from table. Set to `NULL` if no column should be excluded.
 #'     * `measures` (list of [mlr3::Measure])\cr
 #'       Score feature sets on additional measures.
 #'
@@ -67,14 +56,33 @@ ArchiveFSelect = R6Class("ArchiveFSelect",
   public = list(
 
     #' @field benchmark_result ([mlr3::BenchmarkResult])\cr
-    #' Stores benchmark result.
+    #' Benchmark result.
     benchmark_result = NULL,
 
     #' @description
-    #' Retrieve [mlr3::Learner] of the i-th evaluation, by position
-    #' or by unique hash `uhash`. `i` and `uhash` are mutually exclusive.
-    #' Learner does not contain a model. Use `$learners()` to get learners with
-    #' models.
+    #' Creates a new instance of this [R6][R6::R6Class] class.
+    #'
+    #' @param search_space ([paradox::ParamSet])\cr
+    #'   Search space.
+    #'   Internally created from provided [mlr3::Task] by instance.
+    #'
+    #' @param codomain ([bbotk::Codomain])\cr
+    #'   Specifies codomain of objective function i.e. a set of performance measures.
+    #'   Internally created from provided [mlr3::Measure]s by instance.
+    #'
+    #' @param check_values (`logical(1)`)\cr
+    #'   If `TRUE` (default), hyperparameter configurations are checked for validity.
+    initialize = function(search_space, codomain, check_values = TRUE) {
+      super$initialize(search_space, codomain, check_values)
+
+      # initialize empty benchmark result
+      self$benchmark_result = BenchmarkResult$new()
+    },
+
+    #' @description
+    #' Retrieve [mlr3::Learner] of the i-th evaluation, by position or by unique hash `uhash`.
+    #' `i` and `uhash` are mutually exclusive.
+    #' Learner does not contain a model. Use `$learners()` to get learners with models.
     #'
     #' @param i (`integer(1)`)\cr
     #' The iteration value to filter for.
@@ -138,20 +146,16 @@ ArchiveFSelect = R6Class("ArchiveFSelect",
 )
 
 #' @export
-as.data.table.ArchiveFSelect = function(x, ..., unnest = NULL, exclude_columns = "uhash", measures = NULL) {
+as.data.table.ArchiveFSelect = function(x, ..., exclude_columns = "uhash", measures = NULL) {
   if (nrow(x$data) == 0) return(data.table())
   # always ignore x_domain column
   exclude_columns = c("x_domain", exclude_columns)
   # default value for exclude_columns might be not present in archive
-  if (is.null(x$benchmark_result)) exclude_columns = exclude_columns[exclude_columns %nin% "uhash"]
-
-  assert_subset(unnest, names(x$data))
+  if (!x$benchmark_result$n_resample_results) exclude_columns = exclude_columns[exclude_columns %nin% "uhash"]
   cols_y_extra = NULL
+  tab = copy(x$data)
 
-  # unnest data
-  tab = unnest(copy(x$data), unnest, prefix = "{col}_")
-
-  if (!is.null(x$benchmark_result)) {
+  if (x$benchmark_result$n_resample_results) {
     # add extra measures
     if (!is.null(measures)) {
       measures = assert_measures(as_measures(measures), learner = x$learners(1)[[1]], task = x$resample_result(1)$task)

diff --git a/R/AutoFSelector.R b/R/AutoFSelector.R
@@ -1,4 +1,4 @@
-#' @title AutoFSelector
+#' @title Class for Automatic Feature Selection
 #'
 #' @description
 #' The [AutoFSelector] wraps a [mlr3::Learner] and augments it with an automatic feature selection.
@@ -34,26 +34,28 @@
 #'
 #' @export
 #' @examples
-#' # Automafsic Feafsure Selection
+#' # Automatic Feature Selection
+#' \donttest{
 #'
+#' # split to train and external set
 #' task = tsk("penguins")
-#' train_set = sample(task$nrow, 0.8 * task$nrow)
-#' test_set = setdiff(seq_len(task$nrow), train_set)
+#' split = partition(task, ratio = 0.8)
 #'
+#' # create auto fselector
 #' afs = auto_fselector(
 #'   method = fs("random_search"),
 #'   learner = lrn("classif.rpart"),
 #'   resampling = rsmp ("holdout"),
 #'   measure = msr("classif.ce"),
 #'   term_evals = 4)
 #'
-#' # optimize feafsure subset and fit final model
-#' afs$train(task, row_ids = train_set)
+#' # optimize feature subset and fit final model
+#' afs$train(task, row_ids = split$train)
 #'
 #' # predict with final model
-#' afs$predict(task, row_ids = test_set)
+#' afs$predict(task, row_ids = split$test)
 #'
-#' # show fselect result
+#' # show result
 #' afs$fselect_result
 #'
 #' # model slot contains trained learner and fselect instance
@@ -84,8 +86,9 @@
 #' # performance scores estimated on the outer resampling
 #' rr$score()
 #'
-#' # unbiased performance of the final model trained on the full dafsa set
+#' # unbiased performance of the final model trained on the full data set
 #' rr$aggregate()
+#' }
 AutoFSelector = R6Class("AutoFSelector",
   inherit = Learner,
   public = list(