From 55f17350e1d478ca3d4fae5e016d7363adaf23e1 Mon Sep 17 00:00:00 2001 From: Marc Becker <33069354+be-marc@users.noreply.github.com> Date: Tue, 15 Nov 2022 17:54:24 +0100 Subject: [PATCH] refactor: add tuning changes (#59) * refactor: rewrite auto fselector function and class * refactor: objective * fix: task clone * fix: parameters * refactor: objective * docs: rebuild * fix: as.data.table * docs: update * docs: update examples * docs: update --- DESCRIPTION | 32 ++--- NAMESPACE | 1 + NEWS.md | 2 + R/ArchiveFSelect.R | 98 +++++++-------- R/AutoFSelector.R | 21 ++-- R/FSelectInstanceMultiCrit.R | 102 ++++++++-------- R/FSelectInstanceSingleCrit.R | 112 ++++++++++-------- R/FSelector.R | 94 +++++++-------- R/FSelectorDesignPoints.R | 31 ++--- R/FSelectorExhaustiveSearch.R | 13 +- R/FSelectorGeneticSearch.R | 19 +-- R/FSelectorRFE.R | 25 ++-- R/FSelectorRandomSearch.R | 29 +++-- R/FSelectorSequential.R | 15 ++- R/FSelectorShadowVariableSearch.R | 18 +-- R/ObjectiveFSelect.R | 95 +++++++-------- R/auto_fselector.R | 10 +- R/extract_inner_fselect_archives.R | 35 +++--- R/extract_inner_fselect_results.R | 36 ++++-- R/fselect.R | 54 +++++++-- R/fselect_nested.R | 9 +- R/helper.R | 10 +- R/sugar.R | 44 ++++++- man-roxygen/example.R | 14 +-- man-roxygen/section_dictionary_fselectors.R | 4 +- man/ArchiveFSelect.Rd | 96 ++++++++------- man/AutoFSelector.Rd | 21 ++-- man/FSelectInstanceMultiCrit.Rd | 99 ++++++++-------- man/FSelectInstanceSingleCrit.Rd | 107 +++++++++-------- man/FSelector.Rd | 75 +++++------- man/ObjectiveFSelect.Rd | 25 ++-- man/auto_fselector.Rd | 53 ++++++++- man/extract_inner_fselect_archives.Rd | 24 ++-- man/extract_inner_fselect_results.Rd | 27 +++-- man/fs.Rd | 18 ++- man/fselect.Rd | 70 +++++++++-- man/fselect_nested.Rd | 7 +- man/fsi.Rd | 86 ++++++++++++++ man/mlr_fselectors.Rd | 9 ++ man/mlr_fselectors_design_points.Rd | 48 ++++---- man/mlr_fselectors_exhaustive_search.Rd | 44 ++++--- man/mlr_fselectors_genetic_search.Rd | 48 ++++---- 
man/mlr_fselectors_random_search.Rd | 46 ++++--- man/mlr_fselectors_rfe.Rd | 40 ++++--- man/mlr_fselectors_sequential.Rd | 40 ++++--- man/mlr_fselectors_shadow_variable_search.Rd | 35 ++++-- tests/testthat/test_ArchiveFSelect.R | 32 ++--- tests/testthat/test_AutoFSelector.R | 2 +- tests/testthat/test_ObjectiveFSelect.R | 16 +-- ... => test_extract_inner_fselect_archives.R} | 86 +++++++------- .../test_extract_inner_fselect_result.R | 102 +++++++++------- tests/testthat/test_fsi.R | 19 +++ 52 files changed, 1306 insertions(+), 892 deletions(-) create mode 100644 man/fsi.Rd rename tests/testthat/{extract_inner_fselect_archives.R => test_extract_inner_fselect_archives.R} (55%) create mode 100644 tests/testthat/test_fsi.R diff --git a/DESCRIPTION b/DESCRIPTION index 336c8172..55d30bd0 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -54,30 +54,30 @@ Language: en-US NeedsCompilation: no Roxygen: list(markdown = TRUE) RoxygenNote: 7.2.1 -Collate: - 'assertions.R' - 'AutoFSelector.R' +Collate: 'ArchiveFSelect.R' - 'ObjectiveFSelect.R' - 'helper.R' + 'AutoFSelector.R' + 'FSelectInstanceSingleCrit.R' + 'FSelectInstanceMultiCrit.R' 'mlr_fselectors.R' - 'auto_fselector.R' - 'extract_inner_fselect_archives.R' - 'extract_inner_fselect_results.R' - 'fselect.R' - 'fselect_nested.R' 'FSelector.R' - 'FSelectorFromOptimizer.R' + 'FSelectorDesignPoints.R' 'FSelectorExhaustiveSearch.R' + 'FSelectorFromOptimizer.R' + 'FSelectorGeneticSearch.R' 'FSelectorRFE.R' 'FSelectorRandomSearch.R' 'FSelectorSequential.R' 'FSelectorShadowVariableSearch.R' - 'FSelectorDesignPoints.R' - 'FSelectorGeneticSearch.R' - 'FSelectInstanceMultiCrit.R' - 'FSelectInstanceSingleCrit.R' + 'ObjectiveFSelect.R' + 'assertions.R' + 'auto_fselector.R' + 'bibentries.R' + 'extract_inner_fselect_archives.R' + 'extract_inner_fselect_results.R' + 'fselect.R' + 'fselect_nested.R' + 'helper.R' 'reexports.R' 'sugar.R' - 'bibentries.R' 'zzz.R' diff --git a/NAMESPACE b/NAMESPACE index 0ba34852..7c62f04d 100644 --- a/NAMESPACE +++ 
b/NAMESPACE @@ -26,6 +26,7 @@ export(extract_inner_fselect_results) export(fs) export(fselect) export(fselect_nested) +export(fsi) export(fss) export(mlr_fselectors) export(mlr_terminators) diff --git a/NEWS.md b/NEWS.md index c5a43846..f73a5f00 100644 --- a/NEWS.md +++ b/NEWS.md @@ -4,6 +4,8 @@ * refactor: The `AutoFSelector` stores the instance and benchmark result if `store_models = TRUE`. * refactor: The `AutoFSelector` stores the instance if `store_benchmark_result = TRUE`. * feat: Add missing parameters from `AutoFSelector` to `auto_fselect()`. +* feat: Add `fsi()` function to create a `FSelectInstanceSingleCrit` or `FSelectInstanceMultiCrit`. +* refactor: Remove `unnest` option from `as.data.table.ArchiveFSelect()` function. # mlr3fselect 0.7.2 diff --git a/R/ArchiveFSelect.R b/R/ArchiveFSelect.R index 860a72b9..5157b91c 100644 --- a/R/ArchiveFSelect.R +++ b/R/ArchiveFSelect.R @@ -1,8 +1,16 @@ -#' @title Logging Object for Evaluated Feature Sets +#' @title Class for Logging Evaluated Feature Sets #' #' @description -#' Container around a [data.table::data.table()] which stores all evaluated -#' feature sets and performance scores. +#' The [ArchiveFSelect] stores all evaluated feature sets and performance scores. +#' +#' @details +#' The [ArchiveFSelect] is a container around a [data.table::data.table()]. +#' Each row corresponds to a single evaluation of a feature set. +#' See the section on Data Structure for more information. +#' The archive stores additionally a [mlr3::BenchmarkResult] (`$benchmark_result`) that records the resampling experiments. +#' Each experiment corresponds to a single evaluation of a feature set. +#' The table (`$data`) and the benchmark result (`$benchmark_result`) are linked by the `uhash` column. +#' If the archive is passed to `as.data.table()`, both are joined automatically. #' #' @section Data structure: #' @@ -11,52 +19,33 @@ #' * One column for each feature of the task (`$search_space`). 
#' * One column for each performance measure (`$codomain`). #' * `runtime_learners` (`numeric(1)`)\cr -#' Sum of training and predict times logged in learners per -#' [mlr3::ResampleResult] / evaluation. This does not include potential -#' overhead time. +#' Sum of training and predict times logged in learners per [mlr3::ResampleResult] / evaluation. +#' This does not include potential overhead time. #' * `timestamp` (`POSIXct`)\cr #' Time stamp when the evaluation was logged into the archive. #' * `batch_nr` (`integer(1)`)\cr -#' Feature sets are evaluated in batches. Each batch has a unique batch -#' number. +#' Feature sets are evaluated in batches. Each batch has a unique batch number. #' * `uhash` (`character(1)`)\cr -#' Connects each feature set to the resampling experiment -#' stored in the [mlr3::BenchmarkResult]. -#' -#' Each row corresponds to a single evaluation of a feature set. -#' -#' The archive stores additionally a [mlr3::BenchmarkResult] -#' (`$benchmark_result`) that records the resampling experiments. Each -#' experiment corresponds to to a single evaluation of a feature set. The table -#' (`$data`) and the benchmark result (`$benchmark_result`) are linked by the -#' `uhash` column. If the results are viewed with `as.data.table()`, both are -#' joined automatically. +#' Connects each feature set to the resampling experiment stored in the [mlr3::BenchmarkResult]. #' #' @section Analysis: -#' -#' For analyzing the feature selection results, it is recommended to pass the archive to -#' `as.data.table()`. The returned data table is joined with the benchmark result -#' which adds the [mlr3::ResampleResult] for each feature set. +#' For analyzing the feature selection results, it is recommended to pass the archive to `as.data.table()`. +#' The returned data table is joined with the benchmark result which adds the [mlr3::ResampleResult] for each feature set. #' #' The archive provides various getters (e.g. `$learners()`) to ease the access. 
-#' All getters extract by position (`i`) or unique hash (`uhash`). For a -#' complete list of all getters see the methods section. +#' All getters extract by position (`i`) or unique hash (`uhash`). +#' For a complete list of all getters see the methods section. #' -#' The benchmark result (`$benchmark_result`) allows to score the feature sets -#' again on a different measure. Alternatively, measures can be supplied to -#' `as.data.table()`. +#' The benchmark result (`$benchmark_result`) allows to score the feature sets again on a different measure. +#' Alternatively, measures can be supplied to `as.data.table()`. #' #' @section S3 Methods: -#' * `as.data.table.ArchiveFSelect(x, unnest = NULL, exclude_columns = "uhash", measures = NULL)`\cr +#' * `as.data.table.ArchiveFSelect(x, exclude_columns = "uhash", measures = NULL)`\cr #' Returns a tabular view of all evaluated feature sets.\cr #' [ArchiveFSelect] -> [data.table::data.table()]\cr #' * `x` ([ArchiveFSelect]) -#' * `unnest` (`character()`)\cr -#' Transforms list columns to separate columns. Set to `NULL` if no column -#' should be unnested. #' * `exclude_columns` (`character()`)\cr -#' Exclude columns from table. Set to `NULL` if no column should be -#' excluded. +#' Exclude columns from table. Set to `NULL` if no column should be excluded. #' * `measures` (list of [mlr3::Measure])\cr #' Score feature sets on additional measures. #' @@ -67,14 +56,33 @@ ArchiveFSelect = R6Class("ArchiveFSelect", public = list( #' @field benchmark_result ([mlr3::BenchmarkResult])\cr - #' Stores benchmark result. + #' Benchmark result. benchmark_result = NULL, #' @description - #' Retrieve [mlr3::Learner] of the i-th evaluation, by position - #' or by unique hash `uhash`. `i` and `uhash` are mutually exclusive. - #' Learner does not contain a model. Use `$learners()` to get learners with - #' models. + #' Creates a new instance of this [R6][R6::R6Class] class. 
+ #' + #' @param search_space ([paradox::ParamSet])\cr + #' Search space. + #' Internally created from provided [mlr3::Task] by instance. + #' + #' @param codomain ([bbotk::Codomain])\cr + #' Specifies codomain of objective function i.e. a set of performance measures. + #' Internally created from provided [mlr3::Measure]s by instance. + #' + #' @param check_values (`logical(1)`)\cr + #' If `TRUE` (default), hyperparameter configurations are checked for validity. + initialize = function(search_space, codomain, check_values = TRUE) { + super$initialize(search_space, codomain, check_values) + + # initialize empty benchmark result + self$benchmark_result = BenchmarkResult$new() + }, + + #' @description + #' Retrieve [mlr3::Learner] of the i-th evaluation, by position or by unique hash `uhash`. + #' `i` and `uhash` are mutually exclusive. + #' Learner does not contain a model. Use `$learners()` to get learners with models. #' #' @param i (`integer(1)`)\cr #' The iteration value to filter for. @@ -138,20 +146,16 @@ ArchiveFSelect = R6Class("ArchiveFSelect", ) #' @export -as.data.table.ArchiveFSelect = function(x, ..., unnest = NULL, exclude_columns = "uhash", measures = NULL) { +as.data.table.ArchiveFSelect = function(x, ..., exclude_columns = "uhash", measures = NULL) { if (nrow(x$data) == 0) return(data.table()) # always ignore x_domain column exclude_columns = c("x_domain", exclude_columns) # default value for exclude_columns might be not present in archive - if (is.null(x$benchmark_result)) exclude_columns = exclude_columns[exclude_columns %nin% "uhash"] - - assert_subset(unnest, names(x$data)) + if (!x$benchmark_result$n_resample_results) exclude_columns = exclude_columns[exclude_columns %nin% "uhash"] cols_y_extra = NULL + tab = copy(x$data) - # unnest data - tab = unnest(copy(x$data), unnest, prefix = "{col}_") - - if (!is.null(x$benchmark_result)) { + if (x$benchmark_result$n_resample_results) { # add extra measures if (!is.null(measures)) { measures = 
assert_measures(as_measures(measures), learner = x$learners(1)[[1]], task = x$resample_result(1)$task) diff --git a/R/AutoFSelector.R b/R/AutoFSelector.R index 63187d0e..33393f99 100644 --- a/R/AutoFSelector.R +++ b/R/AutoFSelector.R @@ -1,4 +1,4 @@ -#' @title AutoFSelector +#' @title Class for Automatic Feature Selection #' #' @description #' The [AutoFSelector] wraps a [mlr3::Learner] and augments it with an automatic feature selection. @@ -34,12 +34,14 @@ #' #' @export #' @examples -#' # Automafsic Feafsure Selection +#' # Automatic Feature Selection +#' \donttest{ #' +#' # split to train and external set #' task = tsk("penguins") -#' train_set = sample(task$nrow, 0.8 * task$nrow) -#' test_set = setdiff(seq_len(task$nrow), train_set) +#' split = partition(task, ratio = 0.8) #' +#' # create auto fselector #' afs = auto_fselector( #' method = fs("random_search"), #' learner = lrn("classif.rpart"), @@ -47,13 +49,13 @@ #' measure = msr("classif.ce"), #' term_evals = 4) #' -#' # optimize feafsure subset and fit final model -#' afs$train(task, row_ids = train_set) +#' # optimize feature subset and fit final model +#' afs$train(task, row_ids = split$train) #' #' # predict with final model -#' afs$predict(task, row_ids = test_set) +#' afs$predict(task, row_ids = split$test) #' -#' # show fselect result +#' # show result #' afs$fselect_result #' #' # model slot contains trained learner and fselect instance @@ -84,8 +86,9 @@ #' # performance scores estimated on the outer resampling #' rr$score() #' -#' # unbiased performance of the final model trained on the full dafsa set +#' # unbiased performance of the final model trained on the full data set #' rr$aggregate() +#' } AutoFSelector = R6Class("AutoFSelector", inherit = Learner, public = list( diff --git a/R/FSelectInstanceMultiCrit.R b/R/FSelectInstanceMultiCrit.R index 405968a3..0a115165 100644 --- a/R/FSelectInstanceMultiCrit.R +++ b/R/FSelectInstanceMultiCrit.R @@ -1,23 +1,14 @@ -#' @title Multi Criterion Feature 
Selection Instance +#' @title Class for Multi Criteria Feature Selection #' -#' @description -#' Specifies a general feature selection scenario, including objective function -#' and archive for feature selection algorithms to act upon. This class stores -#' an [ObjectiveFSelect] object that encodes the black box objective function -#' which an [FSelector] has to optimize. It allows the basic operations of -#' querying the objective at feature subsets (`$eval_batch()`), storing the -#' evaluations in the internal [bbotk::Archive] and accessing the final result -#' (`$result`). +#' @include FSelectInstanceSingleCrit.R ArchiveFSelect.R #' -#' Evaluations of feature subsets are performed in batches by calling -#' [mlr3::benchmark()] internally. Before a batch is evaluated, the -#' [bbotk::Terminator] is queried for the remaining budget. If the available -#' budget is exhausted, an exception is raised, and no further evaluations can -#' be performed from this point on. +#' @description +#' The [FSelectInstanceMultiCrit] specifies a feature selection problem for [FSelectors][FSelector]. +#' The function [fsi()] creates a [FSelectInstanceMultiCrit] and the function [fselect()] creates an instance internally. #' -#' The [FSelector] is also supposed to store its final result, consisting -#' of the selected feature subsets and associated estimated performance values, by -#' calling the method `instance$assign_result()`. 
+#' @inherit FSelectInstanceSingleCrit details +#' @inheritSection FSelectInstanceSingleCrit Resources +#' @inheritSection ArchiveFSelect Analysis #' #' @template param_task #' @template param_learner @@ -31,63 +22,66 @@ #' #' @export #' @examples -#' library(mlr3) -#' library(data.table) -#' -#' # Objects required to define the performance evaluator -#' task = tsk("iris") -#' measures = msrs(c("classif.ce", "classif.acc")) -#' learner = lrn("classif.rpart") -#' resampling = rsmp("cv") -#' terminator = trm("evals", n_evals = 8) +#' # Feature selection on Palmer Penguins data set +#' task = tsk("penguins") #' -#' inst = FSelectInstanceMultiCrit$new( +#' # Construct feature selection instance +#' instance = fsi( #' task = task, -#' learner = learner, -#' resampling = resampling, -#' measures = measures, -#' terminator = terminator +#' learner = lrn("classif.rpart"), +#' resampling = rsmp("cv", folds = 3), +#' measures = msrs(c("classif.ce", "time_train")), +#' terminator = trm("evals", n_evals = 4) #' ) #' -#' # Try some feature subsets -#' xdt = data.table( -#' Petal.Length = c(TRUE, FALSE), -#' Petal.Width = c(FALSE, TRUE), -#' Sepal.Length = c(TRUE, FALSE), -#' Sepal.Width = c(FALSE, TRUE) -#' ) +#' # Choose optimization algorithm +#' fselector = fs("random_search", batch_size = 2) +#' +#' # Run feature selection +#' fselector$optimize(instance) #' -#' inst$eval_batch(xdt) +#' # Optimal feature sets +#' instance$result_feature_set #' -#' # Get archive data -#' as.data.table(inst$archive) +#' # Inspect all evaluated sets +#' as.data.table(instance$archive) FSelectInstanceMultiCrit = R6Class("FSelectInstanceMultiCrit", inherit = OptimInstanceMultiCrit, public = list( #' @description #' Creates a new instance of this [R6][R6::R6Class] class. 
- initialize = function(task, learner, resampling, measures, terminator, - store_models = FALSE, check_values = TRUE, store_benchmark_result = TRUE) { - obj = ObjectiveFSelect$new(task = task, learner = learner, - resampling = resampling, measures = measures, + initialize = function(task, learner, resampling, measures, terminator, store_benchmark_result = TRUE, store_models = FALSE, check_values = FALSE) { + # initialize specialized fselect archive and objective + archive = ArchiveFSelect$new( + search_space = task_to_domain(assert_task(task)), + codomain = measures_to_codomain(assert_measures(measures)), + check_values = check_values) + + objective = ObjectiveFSelect$new( + task = task, + learner = learner, + resampling = resampling, + measures = measures, store_benchmark_result = store_benchmark_result, - store_models = store_models, check_values = check_values) - super$initialize(obj, obj$domain, terminator) + store_models = store_models, + check_values = check_values, + archive = archive) - self$archive = ArchiveFSelect$new(search_space = self$objective$domain, codomain = self$objective$codomain, - check_values = check_values) - self$objective$archive = self$archive + super$initialize(objective, objective$domain, terminator) + + # super class of instance initializes default archive, overwrite with fselect archive + self$archive = archive private$.objective_function = objective_function }, #' @description - #' The [FSelector] object writes the best found feature subsets - #' and estimated performance values here. For internal use. + #' The [FSelector] object writes the best found feature subsets and estimated performance values here. + #' For internal use. #' #' @param ydt (`data.table::data.table()`)\cr - #' Optimal outcomes, e.g. the Pareto front. + #' Optimal outcomes, e.g. the Pareto front. 
assign_result = function(xdt, ydt) { # Add feature names to result for easy task subsetting features = map(transpose_list(xdt), function(x) { @@ -103,7 +97,7 @@ FSelectInstanceMultiCrit = R6Class("FSelectInstanceMultiCrit", ), active = list( - #' @field result_feature_set (`list()` of `character()`)\cr + #' @field result_feature_set (list of `character()`)\cr #' Feature sets for task subsetting. result_feature_set = function() { map(self$result$features, function(x) { diff --git a/R/FSelectInstanceSingleCrit.R b/R/FSelectInstanceSingleCrit.R index 16ec368d..c1a2060e 100644 --- a/R/FSelectInstanceSingleCrit.R +++ b/R/FSelectInstanceSingleCrit.R @@ -1,23 +1,26 @@ -#' @title Single Criterion Feature Selection Instance +#' @title Class for Single Criterion Feature Selection +#' +#' @include ArchiveFSelect.R +#' +#' @description +#' The [FSelectInstanceSingleCrit] specifies a feature selection problem for [FSelectors][FSelector]. +#' The function [fsi()] creates a [FSelectInstanceSingleCrit] and the function [fselect()] creates an instance internally. #' #' @description -#' Specifies a general feature selection scenario, including objective function -#' and archive for feature selection algorithms to act upon. This class stores -#' an [ObjectiveFSelect] object that encodes the black box objective function -#' which an [FSelector] has to optimize. It allows the basic operations of -#' querying the objective at feature subsets (`$eval_batch()`), storing the -#' evaluations in the internal [bbotk::Archive] and accessing the final result -#' (`$result`). +#' The instance contains an [ObjectiveFSelect] object that encodes the black box objective function a [FSelector] has to optimize. +#' The instance allows the basic operations of querying the objective at design points (`$eval_batch()`). +#' This operation is usually done by the [FSelector]. +#' Evaluations of feature subsets are performed in batches by calling [mlr3::benchmark()] internally. 
+#' The evaluated feature subsets are stored in the [Archive][ArchiveFSelect] (`$archive`). +#' Before a batch is evaluated, the [bbotk::Terminator] is queried for the remaining budget. +#' If the available budget is exhausted, an exception is raised, and no further evaluations can be performed from this point on. +#' The [FSelector] is also supposed to store its final result, consisting of a selected feature subset and associated estimated performance values, by calling the method `instance$assign_result()`. #' -#' Evaluations of feature subsets are performed in batches by calling -#' [mlr3::benchmark()] internally. Before a batch is evaluated, the -#' [bbotk::Terminator] is queried for the remaining budget. If the available -#' budget is exhausted, an exception is raised, and no further evaluations can -#' be performed from this point on. +#' @inheritSection ArchiveFSelect Analysis #' -#' The [FSelector] is also supposed to store its final result, consisting -#' of a selected feature subset and associated estimated performance values, by -#' calling the method `instance$assign_result()`. +#' @section Resources: +#' * [book chapter](https://mlr3book.mlr-org.com/feature-selection.html#fs-wrapper) on feature selection. +#' * [gallery post](https://mlr-org.com/gallery/2020-09-14-mlr3fselect-basic/) on feature selection on the Titanic data set. 
#' #' @template param_task #' @template param_learner @@ -31,63 +34,70 @@ #' #' @export #' @examples -#' library(mlr3) -#' library(data.table) -#' -#' # Objects required to define the objective function -#' task = tsk("iris") -#' measure = msr("classif.ce") +#' # Feature selection on Palmer Penguins data set +#' task = tsk("penguins") #' learner = lrn("classif.rpart") -#' resampling = rsmp("cv") #' -#' # Create instance -#' terminator = trm("evals", n_evals = 8) -#' inst = FSelectInstanceSingleCrit$new( +#' # Construct feature selection instance +#' instance = fsi( #' task = task, #' learner = learner, -#' resampling = resampling, -#' measure = measure, -#' terminator = terminator +#' resampling = rsmp("cv", folds = 3), +#' measures = msr("classif.ce"), +#' terminator = trm("evals", n_evals = 4) #' ) #' -#' # Try some feature subsets -#' xdt = data.table( -#' Petal.Length = c(TRUE, FALSE), -#' Petal.Width = c(FALSE, TRUE), -#' Sepal.Length = c(TRUE, FALSE), -#' Sepal.Width = c(FALSE, TRUE) -#' ) +#' # Choose optimization algorithm +#' fselector = fs("random_search", batch_size = 2) +#' +#' # Run feature selection +#' fselector$optimize(instance) #' -#' inst$eval_batch(xdt) +#' # Subset task to optimal feature set +#' task$select(instance$result_feature_set) #' -#' # Get archive data -#' as.data.table(inst$archive) +#' # Train the learner with optimal feature set on the full data set +#' learner$train(task) +#' +#' # Inspect all evaluated sets +#' as.data.table(instance$archive) FSelectInstanceSingleCrit = R6Class("FSelectInstanceSingleCrit", inherit = OptimInstanceSingleCrit, public = list( #' @description #' Creates a new instance of this [R6][R6::R6Class] class. 
- initialize = function(task, learner, resampling, measure, terminator, - store_models = FALSE, check_values = TRUE, store_benchmark_result = TRUE) { - obj = ObjectiveFSelect$new(task = task, learner = learner, - resampling = resampling, measures = measure, + initialize = function(task, learner, resampling, measure, terminator, store_benchmark_result = TRUE, store_models = FALSE, check_values = FALSE) { + # initialize specialized fselect archive and objective + archive = ArchiveFSelect$new( + search_space = task_to_domain(assert_task(task)), + codomain = measures_to_codomain(assert_measure(measure)), + check_values = check_values) + + objective = ObjectiveFSelect$new( + task = task, + learner = learner, + resampling = resampling, + measures = measure, store_benchmark_result = store_benchmark_result, - store_models = store_models, check_values = check_values) - super$initialize(obj, obj$domain, terminator) + store_models = store_models, + check_values = check_values, + archive = archive) - self$archive = ArchiveFSelect$new(search_space = self$objective$domain, codomain = self$objective$codomain, - check_values = check_values) - self$objective$archive = self$archive + super$initialize(objective, objective$domain, terminator) + + # super class of instance initializes default archive, overwrite with fselect archive + self$archive = archive private$.objective_function = objective_function }, #' @description - #' The [FSelector] writes the best found feature subset - #' and estimated performance value here. For internal use. + #' The [FSelector] writes the best found feature subset and estimated performance value here. + #' For internal use. + #' #' @param y (`numeric(1)`)\cr - #' Optimal outcome. + #' Optimal outcome. 
assign_result = function(xdt, y) { # Add feature names to result for easy task subsetting features = list(self$objective$task$feature_names[as.logical(xdt)]) diff --git a/R/FSelector.R b/R/FSelector.R index 5f95ede4..a301dbf5 100644 --- a/R/FSelector.R +++ b/R/FSelector.R @@ -1,56 +1,42 @@ -#' @title FSelector +#' @title Class for Feature Selection Algorithms +#' +#' @include mlr_fselectors.R #' #' @description -#' Abstract `FSelector` class that implements the base functionality each -#' fselector must provide. A `FSelector` object describes the feature selection -#' strategy, i.e. how to optimize the black-box function and its feasible set -#' defined by the [FSelectInstanceSingleCrit] / [FSelectInstanceMultiCrit] object. +#' The [FSelector] implements the optimization algorithm. #' -#' A fselector must write its result into the [FSelectInstanceSingleCrit] / -#' [FSelectInstanceMultiCrit] using the `assign_result` method of the -#' [bbotk::OptimInstance] at the end of its selection in order to store the best -#' selected feature subset and its estimated performance vector. +#' @details +#' [FSelector] is an abstract base class that implements the base functionality each fselector must provide. +#' A subclass is implemented in the following way: +#' * Inherit from FSelector. +#' * Specify the private abstract method `$.optimize()` and use it to call into your optimizer. +#' * You need to call `instance$eval_batch()` to evaluate design points. +#' * The batch evaluation is requested at the [FSelectInstanceSingleCrit]/[FSelectInstanceMultiCrit] object `instance`, so each batch is possibly executed in parallel via [mlr3::benchmark()], and all evaluations are stored inside of `instance$archive`. +#' * Before the batch evaluation, the [bbotk::Terminator] is checked, and if it is positive, an exception of class `"terminated_error"` is generated. 
+#' In the latter case the current batch of evaluations is still stored in `instance`, but the numeric scores are not sent back to the handling optimizer as it has lost execution control. +#' * After such an exception was caught we select the best set from `instance$archive` and return it. +#' * Note that therefore more points than specified by the [bbotk::Terminator] may be evaluated, as the Terminator is only checked before a batch evaluation, and not in-between evaluations in a batch. +#' How many more depends on the setting of the batch size. +#' * Overwrite the private super-method `.assign_result()` if you want to decide yourself how to estimate the final set in the instance and its estimated performance. +#' The default behavior is: We pick the best resample-experiment, regarding the given measure, then assign its set and aggregated performance to the instance. #' #' @section Private Methods: #' * `.optimize(instance)` -> `NULL`\cr -#' Abstract base method. Implement to specify feature selection of your -#' subclass. See technical details sections. +#' Abstract base method. Implement to specify feature selection of your subclass. +#' See the details section. #' * `.assign_result(instance)` -> `NULL`\cr -#' Abstract base method. Implement to specify how the final feature subset is -#' selected. See technical details sections. +#' Abstract base method. Implement to specify how the final feature subset is selected. +#' See the details section. #' -#' @section Technical Details and Subclasses: -#' A subclass is implemented in the following way: -#' * Inherit from `FSelector`. -#' * Specify the private abstract method `$.optimize()` and use it to call into -#' your optimizer. -#' * You need to call `instance$eval_batch()` to evaluate feature subsets. 
-#' * The batch evaluation is requested at the [FSelectInstanceSingleCrit] / -#' [FSelectInstanceMultiCrit] object `instance`, so each batch is possibly -#' executed in parallel via [mlr3::benchmark()], and all evaluations are stored -#' inside of `instance$archive`. -#' * Before the batch evaluation, the [bbotk::Terminator] is checked, and if it is -#' positive, an exception of class `"terminated_error"` is generated. In the -#' later case the current batch of evaluations is still stored in `instance`, -#' but the numeric scores are not sent back to the handling optimizer as it has -#' lost execution control. -#' * After such an exception was caught we select the best feature subset from -#' `instance$archive` and return it. -#' * Note that therefore more points than specified by the [bbotk::Terminator] -#' may be evaluated, as the Terminator is only checked before a batch -#' evaluation, and not in-between evaluation in a batch. How many more depends -#' on the setting of the batch size. -#' * Overwrite the private super-method `.assign_result()` if you want to decide -#' yourself how to estimate the final feature subset in the instance and its -#' estimated performance. The default behavior is: We pick the best -#' resample-experiment, regarding the given measure, then assign its -#' feature subset and aggregated performance to the instance. +#' @section Resources: +#' * [book section](https://mlr3book.mlr-org.com/feature-selection.html#the-fselector-class) on feature selection algorithms. #' #' @template param_man #' #' @export FSelector = R6Class("FSelector", public = list( + #' @field id (`character(1)`)\cr #' Identifier of the object. #' Used in tables, plot and text output. @@ -89,6 +75,7 @@ FSelector = R6Class("FSelector", #' @description #' Helper for print outputs. + #' #' @return (`character()`). format = function() { sprintf("<%s>", class(self)[1L]) @@ -96,6 +83,7 @@ FSelector = R6Class("FSelector", #' @description #' Print method. 
+ #' #' @return (`character()`). print = function() { catn(format(self), if (is.na(self$label)) "" else paste0(": ", self$label)) @@ -112,15 +100,13 @@ FSelector = R6Class("FSelector", }, #' @description - #' Performs the feature selection on a [FSelectInstanceSingleCrit] or - #' [FSelectInstanceMultiCrit] until termination. - #' The single evaluations will be written into the [ArchiveFSelect] that resides in the - #' [FSelectInstanceSingleCrit] / [FSelectInstanceMultiCrit]. + #' Performs the feature selection on a [FSelectInstanceSingleCrit] or [FSelectInstanceMultiCrit] until termination. + #' The single evaluations will be written into the [ArchiveFSelect] that resides in the [FSelectInstanceSingleCrit] / [FSelectInstanceMultiCrit]. #' The result will be written into the instance object. #' - #' @param inst ([FSelectInstanceSingleCrit]|[FSelectInstanceMultiCrit]). + #' @param inst ([FSelectInstanceSingleCrit] | [FSelectInstanceMultiCrit]). #' - #' @return [data.table::data.table]. + #' @return [data.table::data.table()]. optimize = function(inst) { assert_multi_class(inst, c("FSelectInstanceSingleCrit", "FSelectInstanceMultiCrit")) optimize_default(inst, self, private) @@ -130,7 +116,7 @@ FSelector = R6Class("FSelector", active = list( #' @field param_set [paradox::ParamSet]\cr - #' Set of control parameters. + #' Set of control parameters. param_set = function(rhs) { if (!missing(rhs) && !identical(rhs, private$.param_set)) { stop("$param_set is read-only.") @@ -139,8 +125,8 @@ FSelector = R6Class("FSelector", }, #' @field properties (`character()`)\cr - #' Set of properties of the fselector. - #' Must be a subset of [`mlr_reflections$fselect_properties`][mlr3::mlr_reflections]. + #' Set of properties of the fselector. + #' Must be a subset of [`mlr_reflections$fselect_properties`][mlr3::mlr_reflections]. 
properties = function(rhs) { if (!missing(rhs) && !identical(rhs, private$.properties)) { stop("$properties is read-only.") @@ -149,8 +135,8 @@ FSelector = R6Class("FSelector", }, #' @field packages (`character()`)\cr - #' Set of required packages. - #' Note that these packages will be loaded via [requireNamespace()], and are not attached. + #' Set of required packages. + #' Note that these packages will be loaded via [requireNamespace()], and are not attached. packages = function(rhs) { if (!missing(rhs) && !identical(rhs, private$.packages)) { stop("$packages is read-only.") @@ -159,8 +145,8 @@ FSelector = R6Class("FSelector", }, #' @field label (`character(1)`)\cr - #' Label for this object. - #' Can be used in tables, plot and text output instead of the ID. + #' Label for this object. + #' Can be used in tables, plot and text output instead of the ID. label = function(rhs) { if (!missing(rhs) && !identical(rhs, private$.param_set)) { stop("$label is read-only.") @@ -169,8 +155,8 @@ FSelector = R6Class("FSelector", }, #' @field man (`character(1)`)\cr - #' String in the format `[pkg]::[topic]` pointing to a manual page for this object. - #' The referenced help package can be opened via method `$help()`. + #' String in the format `[pkg]::[topic]` pointing to a manual page for this object. + #' The referenced help package can be opened via method `$help()`. man = function(rhs) { if (!missing(rhs) && !identical(rhs, private$.man)) { stop("$man is read-only.") diff --git a/R/FSelectorDesignPoints.R b/R/FSelectorDesignPoints.R index 3427f9cd..70c0efa3 100644 --- a/R/FSelectorDesignPoints.R +++ b/R/FSelectorDesignPoints.R @@ -1,11 +1,14 @@ -#' @title Feature Selection via Design Points +#' @title Feature Selection with Design Points #' +#' @include mlr_fselectors.R #' @name mlr_fselectors_design_points #' #' @description -#' Design points uses feature sets specified by the user. +#' Feature selection using user-defined feature sets. 
#' +#' @details #' The feature sets are evaluated in order as given. +#' #' The feature selection terminates itself when all feature sets are evaluated. #' It is not necessary to set a termination criterion. #' @@ -14,18 +17,18 @@ #' #' @inheritSection bbotk::OptimizerDesignPoints Parameters #' +#' @family FSelector #' @export #' @examples -#' library(mlr3misc) +#' # Feature Selection +#' \donttest{ #' -#' # retrieve task +#' # retrieve task and load learner #' task = tsk("pima") -#' -#' # load learner #' learner = lrn("classif.rpart") #' #' # create design -#' design = rowwise_table( +#' design = mlr3misc::rowwise_table( #' ~age, ~glucose, ~insulin, ~mass, ~pedigree, ~pregnant, ~pressure, ~triceps, #' TRUE, FALSE, TRUE, TRUE, FALSE, TRUE, FALSE, TRUE, #' TRUE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, FALSE, @@ -33,21 +36,19 @@ #' TRUE, FALSE, TRUE, TRUE, FALSE, TRUE, TRUE, TRUE #' ) #' -#' \donttest{ -#' # feature selection on the pima indians diabetes data set +#' # run feature selection on the Pima Indians diabetes data set #' instance = fselect( -#' method = "design_points", +#' method = fs("design_points", design = design), #' task = task, #' learner = learner, -#' resampling = rsmp("cv", folds = 3), -#' measure = msr("classif.ce"), -#' design = design +#' resampling = rsmp("holdout"), +#' measure = msr("classif.ce") #' ) #' -#' # best performing feature subset +#' # best performing feature set #' instance$result #' -#' # all evaluated feature subsets +#' # all evaluated feature sets #' as.data.table(instance$archive) #' #' # subset the task and fit the final model diff --git a/R/FSelectorExhaustiveSearch.R b/R/FSelectorExhaustiveSearch.R index dc9c452b..40b686e5 100644 --- a/R/FSelectorExhaustiveSearch.R +++ b/R/FSelectorExhaustiveSearch.R @@ -1,22 +1,27 @@ -#' @title Feature Selection via Exhaustive Search +#' @title Feature Selection with Exhaustive Search #' +#' @include mlr_fselectors.R #' @name mlr_fselectors_exhaustive_search #' #' @description -#' 
Exhaustive search generates all possible feature sets. +#' Feature Selection using the Exhaustive Search Algorithm. +#' Exhaustive Search generates all possible feature sets. #' +#' @details #' The feature selection terminates itself when all feature sets are evaluated. #' It is not necessary to set a termination criterion. #' #' @templateVar id exhaustive_search #' @template section_dictionary_fselectors #' -#' @section Parameters: +#' @section Control Parameters: #' \describe{ #' \item{`max_features`}{`integer(1)`\cr -#' Maximum number of features. By default, number of features in [mlr3::Task].} +#' Maximum number of features. +#' By default, number of features in [mlr3::Task].} #' } #' +#' @family FSelector #' @export #' @template example FSelectorExhaustiveSearch = R6Class("FSelectorExhaustiveSearch", diff --git a/R/FSelectorGeneticSearch.R b/R/FSelectorGeneticSearch.R index 365a1928..e4016313 100644 --- a/R/FSelectorGeneticSearch.R +++ b/R/FSelectorGeneticSearch.R @@ -1,30 +1,21 @@ -#' @title Feature Selection via Genetic Search +#' @title Feature Selection with Genetic Search #' +#' @include mlr_fselectors.R #' @name mlr_fselectors_genetic_search #' #' @description -#' Genetic search imitates the process of natural selection to generate feature sets. -#' -#' Calls [genalg::rbga.bin()] from package \CRANpkg{genalg}. +#' Feature selection using the Genetic Algorithm from the package \CRANpkg{genalg}. #' #' @templateVar id genetic_search #' @template section_dictionary_fselectors #' -#' @section Parameters: -#' \describe{ -#' \item{`suggestions`}{`list()`} -#' \item{`popSize`}{`integer(1)`} -#' \item{`mutationChance`}{`numeric(1)`} -#' \item{`elitism`}{`integer(1)`} -#' \item{`zeroToOneRatio`}{`integer(1)`} -#' \item{`iters`}{`integer(1)`} -#' } -#' +#' @section Control Parameters: #' For the meaning of the control parameters, see [genalg::rbga.bin()]. #' [genalg::rbga.bin()] internally terminates after `iters` iteration. 
#' We set `ìters = 100000` to allow the termination via our terminators. #' If more iterations are needed, set `ìters` to a higher value in the parameter set. #' +#' @family FSelector #' @export #' @template example FSelectorGeneticSearch = R6Class("FSelectorGeneticSearch", diff --git a/R/FSelectorRFE.R b/R/FSelectorRFE.R index 0caa0e59..5fe594a2 100644 --- a/R/FSelectorRFE.R +++ b/R/FSelectorRFE.R @@ -1,8 +1,10 @@ -#' @title Feature Selection via Recursive Feature Elimination +#' @title Feature Selection with Recursive Feature Elimination #' +#' @include mlr_fselectors.R #' @name mlr_fselectors_rfe #' #' @description +#' Feature selection using the Recursive Feature Elimination Algorithm (RFE). #' Recursive feature elimination iteratively removes features with a low importance score. #' Only works with [Learner]s that can calculate importance scores (see section on optional extractors in [Learner]). #' @@ -18,12 +20,14 @@ #' @templateVar id rfe #' @template section_dictionary_fselectors #' -#' @section Parameters: +#' @section Control Parameters: #' \describe{ #' \item{`n_features`}{`integer(1)`\cr -#' The number of features to select. By default half of the features are selected.} +#' The number of features to select. +#' By default half of the features are selected.} #' \item{`feature_fraction`}{`double(1)`\cr -#' Fraction of features to retain in each iteration, The default 0.5 retrains half of the features.} +#' Fraction of features to retain in each iteration. +#' The default 0.5 retains half of the features.} #' \item{`feature_number`}{`integer(1)`\cr #' Number of features to remove in each iteration.} #' \item{`subset_sizes`}{`integer()`\cr @@ -35,18 +39,19 @@ #' #' The parameter `feature_fraction`, `feature_number` and `subset_sizes` are mutually exclusive. 
#' +#' @family FSelector #' @export #' @examples -#' # retrieve task -#' task = tsk("pima") +#' # Feature Selection +#' \donttest{ #' -#' # load learner +#' # retrieve task and load learner +#' task = tsk("penguins") #' learner = lrn("classif.rpart") #' -#' \donttest{ -#' # feature selection on the pima indians diabetes data set +#' # run feature selection on the Palmer Penguins data set #' instance = fselect( -#' method = "rfe", +#' method = fs("rfe"), #' task = task, #' learner = learner, #' resampling = rsmp("holdout"), diff --git a/R/FSelectorRandomSearch.R b/R/FSelectorRandomSearch.R index 0f48f2b2..5f5ccec1 100644 --- a/R/FSelectorRandomSearch.R +++ b/R/FSelectorRandomSearch.R @@ -1,20 +1,24 @@ -#' @title Feature Selection via Random Search +#' @title Feature Selection with Random Search #' +#' @include mlr_fselectors.R #' @name mlr_fselectors_random_search #' #' @description -#' Random search randomly draws feature sets. +#' Feature selection using Random Search Algorithm. #' -#' Feature sets are evaluated in batches of size `batch_size`. +#' @details +#' The feature sets are randomly drawn. +#' The sets are evaluated in batches of size `batch_size`. #' Larger batches mean we can parallelize more, smaller batches imply a more fine-grained checking of termination criteria. #' #' @templateVar id random_search #' @template section_dictionary_fselectors #' -#' @section Parameters: +#' @section Control Parameters: #' \describe{ #' \item{`max_features`}{`integer(1)`\cr -#' Maximum number of features. By default, number of features in [mlr3::Task].} +#' Maximum number of features. 
+#' By default, number of features in [mlr3::Task].} #' \item{`batch_size`}{`integer(1)`\cr #' Maximum number of feature sets to try in a batch.} #' } @@ -22,23 +26,24 @@ #' @source #' `r format_bib("bergstra_2012")` #' +#' @family FSelector #' @export #' @examples -#' # retrieve task -#' task = tsk("pima") +#' # Feature Selection +#' \donttest{ #' -#' # load learner +#' # retrieve task and load learner +#' task = tsk("penguins") #' learner = lrn("classif.rpart") #' -#' \donttest{ -#' # feature selection on the pima indians diabetes data set +#' # run feature selection on the Palmer Penguins data set #' instance = fselect( -#' method = "random_search", +#' method = fs("random_search"), #' task = task, #' learner = learner, #' resampling = rsmp("holdout"), #' measure = msr("classif.ce"), -#' term_evals = 100 +#' term_evals = 10 #' ) #' #' # best performing feature subset diff --git a/R/FSelectorSequential.R b/R/FSelectorSequential.R index d0dc7e23..e013a356 100644 --- a/R/FSelectorSequential.R +++ b/R/FSelectorSequential.R @@ -1,10 +1,12 @@ -#' @title Feature Selection via Sequential Search +#' @title Feature Selection with Sequential Search #' +#' @include mlr_fselectors.R #' @name mlr_fselectors_sequential #' #' @description -#' Sequential search iteratively adds features to the set. +#' Feature selection using Sequential Search Algorithm. #' +#' @details #' Sequential forward selection (`strategy = fsf`) extends the feature set in each iteration with the feature that increases the models performance the most. #' Sequential backward selection (`strategy = fsb`) follows the same idea but starts with all features and removes features from the set. #' @@ -14,16 +16,17 @@ #' @templateVar id sequential #' @template section_dictionary_fselectors #' -#' @section Parameters: +#' @section Control Parameters: #' \describe{ #' \item{`min_features`}{`integer(1)`\cr -#' Minimum number of features. By default, 1.} +#' Minimum number of features. 
By default, 1.} #' \item{`max_features`}{`integer(1)`\cr -#' Maximum number of features. By default, number of features in [mlr3::Task].} +#' Maximum number of features. By default, number of features in [mlr3::Task].} #' \item{`strategy`}{`character(1)`\cr -#' Search method `sfs` (forward search) or `sbs` (backward search).} +#' Search method `sfs` (forward search) or `sbs` (backward search).} #' } #' +#' @family FSelector #' @export #' @template example FSelectorSequential = R6Class("FSelectorSequential", diff --git a/R/FSelectorShadowVariableSearch.R b/R/FSelectorShadowVariableSearch.R index fc41421b..6995d39b 100644 --- a/R/FSelectorShadowVariableSearch.R +++ b/R/FSelectorShadowVariableSearch.R @@ -1,10 +1,13 @@ -#' @title Feature Selection via Shadow Variable Search +#' @title Feature Selection with Shadow Variable Search #' +#' @include mlr_fselectors.R #' @name mlr_fselectors_shadow_variable_search #' #' @description +#' Feature selection using the Shadow Variable Search Algorithm. #' Shadow variable search creates for each feature a permutated copy and stops when one of them is selected. #' +#' @details #' The feature selection terminates itself when the first shadow variable is selected. #' It is not necessary to set a termination criterion. 
#' @@ -14,18 +17,19 @@ #' @source #' `r format_bib("thomas2017", "wu2007")` #' +#' @family FSelector #' @export #' @examples -#' # retrieve task -#' task = tsk("pima") +#' # Feature Selection +#' \donttest{ #' -#' # load learner +#' # retrieve task and load learner +#' task = tsk("penguins") #' learner = lrn("classif.rpart") #' -#' \donttest{ -#' # feature selection on the pima indians diabetes data set +#' # run feature selection on the Palmer Penguins data set #' instance = fselect( -#' method = "shadow_variable_search", +#' method = fs("shadow_variable_search"), #' task = task, #' learner = learner, #' resampling = rsmp("holdout"), diff --git a/R/ObjectiveFSelect.R b/R/ObjectiveFSelect.R index c12acc4b..217835a6 100644 --- a/R/ObjectiveFSelect.R +++ b/R/ObjectiveFSelect.R @@ -1,9 +1,8 @@ -#' @title ObjectiveFSelect +#' @title Class for Feature Selection Objective #' #' @description -#' Stores the objective function that estimates the performance of feature -#' subsets. This class is usually constructed internally by by the -#' [FSelectInstanceSingleCrit] / [FSelectInstanceMultiCrit]. +#' Stores the objective function that estimates the performance of feature subsets. +#' This class is usually constructed internally by the [FSelectInstanceSingleCrit] / [FSelectInstanceMultiCrit]. #' #' @template param_task #' @template param_learner @@ -16,21 +15,18 @@ #' @export ObjectiveFSelect = R6Class("ObjectiveFSelect", inherit = Objective, - - #' @description - #' Creates a new instance of this [R6][R6::R6Class] class. public = list( - #' @field task ([mlr3::Task]) + #' @field task ([mlr3::Task]). task = NULL, - #' @field learner ([mlr3::Learner]) + #' @field learner ([mlr3::Learner]). learner = NULL, - #' @field resampling ([mlr3::Resampling]) + #' @field resampling ([mlr3::Resampling]). resampling = NULL, - #' @field measures (list of [mlr3::Measure]) + #' @field measures (list of [mlr3::Measure]). measures = NULL, #' @field store_models (`logical(1)`). 
@@ -44,38 +40,32 @@ ObjectiveFSelect = R6Class("ObjectiveFSelect", #' @description #' Creates a new instance of this [R6][R6::R6Class] class. - initialize = function(task, learner, resampling, measures, - check_values = TRUE, store_benchmark_result = TRUE, - store_models = FALSE) { - + #' + #' @param archive ([ArchiveFSelect])\cr + #' Reference to the archive of [FSelectInstanceSingleCrit] | [FSelectInstanceMultiCrit]. + #' If `NULL` (default), benchmark result and models cannot be stored. + initialize = function(task, learner, resampling, measures, check_values = TRUE, store_benchmark_result = TRUE, store_models = FALSE, archive = NULL) { self$task = assert_task(as_task(task, clone = TRUE)) - self$learner = assert_learner(as_learner(learner, clone = TRUE), - task = self$task) - self$resampling = assert_resampling(as_resampling(resampling, - clone = TRUE)) - self$measures = assert_measures(as_measures(measures, clone = TRUE), - task = self$task, learner = self$learner) - self$store_benchmark_result = assert_logical(store_benchmark_result) - self$store_models = assert_logical(store_models) - if (!resampling$is_instantiated) { - self$resampling$instantiate(self$task) - } - - domain = ParamSet$new(map(self$task$feature_names, - function(s) ParamLgl$new(id = s))) - - codomain = ParamSet$new(map(self$measures, function(s) { - ParamDbl$new(id = s$id, - tags = ifelse(s$minimize, "minimize", "maximize")) - })) - - super$initialize(id = sprintf("%s_on_%s", self$learner$id, self$task$id), - domain = domain, codomain = codomain, check_values = check_values) + self$learner = assert_learner(as_learner(learner, clone = TRUE), task = self$task) + self$resampling = assert_resampling(as_resampling(resampling, clone = TRUE)) + self$measures = assert_measures(as_measures(measures, clone = TRUE), task = self$task, learner = self$learner) + + self$archive = assert_r6(archive, "ArchiveFSelect", null.ok = TRUE) + if (is.null(self$archive)) store_benchmark_result = store_models = FALSE + 
self$store_models = assert_flag(store_models) + self$store_benchmark_result = assert_flag(store_benchmark_result) || self$store_models + + if (!resampling$is_instantiated) self$resampling$instantiate(self$task) + + super$initialize( + id = sprintf("%s_on_%s", self$learner$id, self$task$id), + domain = task_to_domain(self$task), + codomain = measures_to_codomain(self$measures), + check_values = check_values) } ), private = list( - .eval_many = function(xss) { learners = map(xss, function(x) { state = self$task$feature_names[unlist(x)] @@ -84,28 +74,25 @@ ObjectiveFSelect = R6Class("ObjectiveFSelect", GraphLearner$new(graph) }) + # benchmark feature subsets design = benchmark_grid(self$task, learners, self$resampling) - bmr = benchmark(design, store_models = self$store_models) - aggr = bmr$aggregate(self$measures) - y = map_chr(self$measures, "id") + benchmark_result = benchmark(design, store_models = self$store_models) + + # aggregate performance scores + aggregated_performance = benchmark_result$aggregate(self$measures, conditions = TRUE)[, c(self$codomain$target_ids, "warnings", "errors"), with = FALSE] - # add runtime - time = map_dbl(bmr$resample_results$resample_result, function(rr) { - sum(map_dbl(rr$learners, function(l) sum(l$timings))) + # add runtime to evaluations + time = map_dbl(benchmark_result$resample_results$resample_result, function(rr) { + sum(map_dbl(get_private(rr)$.data$learner_states(get_private(rr)$.view), function(state) state$train_time + state$predict_time)) }) - aggr[, "runtime_learners" := time] + set(aggregated_performance, j = "runtime_learners", value = time) + # store benchmark result in archive if (self$store_benchmark_result) { - self$archive$benchmark_result = - if (is.null(self$archive$benchmark_result)) { - self$archive$benchmark_result = bmr - } else { - self$archive$benchmark_result$combine(bmr) - } - cbind(aggr[, c(y, "runtime_learners"), with = FALSE], uhash = bmr$uhashes) - } else { - aggr[, c(y, "runtime_learners"), with = 
FALSE] + self$archive$benchmark_result$combine(benchmark_result) + set(aggregated_performance, j = "uhash", value = benchmark_result$uhashes) } + aggregated_performance } ) ) diff --git a/R/auto_fselector.R b/R/auto_fselector.R index 49f01dfe..4549f594 100644 --- a/R/auto_fselector.R +++ b/R/auto_fselector.R @@ -26,15 +26,7 @@ #' @template param_check_values #' #' @export -#' @examples -#' at = auto_fselector( -#' method = "random_search", -#' learner = lrn("classif.rpart"), -#' resampling = rsmp ("holdout"), -#' measure = msr("classif.ce"), -#' term_evals = 4) -#' -#' at$train(tsk("pima")) +#' @inherit AutoFSelector examples auto_fselector = function(method, learner, resampling, measure = NULL, term_evals = NULL, term_time = NULL, terminator = NULL, store_fselect_instance = TRUE, store_benchmark_result = TRUE, store_models = FALSE, check_values = FALSE, ...) { fselector = if (is.character(method)) { assert_choice(method, mlr_fselectors$keys()) diff --git a/R/extract_inner_fselect_archives.R b/R/extract_inner_fselect_archives.R index 296c5602..f5d4397a 100644 --- a/R/extract_inner_fselect_archives.R +++ b/R/extract_inner_fselect_archives.R @@ -1,12 +1,10 @@ #' @title Extract Inner Feature Selection Archives #' #' @description -#' Extract inner feature selection archives of nested resampling. Implemented for -#' [mlr3::ResampleResult] and [mlr3::BenchmarkResult]. The function iterates -#' over the [AutoFSelector] objects and binds the archives to a -#' [data.table::data.table()]. [AutoFSelector] must be initialized with -#' `store_fselect_instance = TRUE` and `resample()` or `benchmark()` must be -#' called with `store_models = TRUE`. +#' Extract inner feature selection archives of nested resampling. +#' Implemented for [mlr3::ResampleResult] and [mlr3::BenchmarkResult]. +#' The function iterates over the [AutoFSelector] objects and binds the archives to a [data.table::data.table()]. 
+#' [AutoFSelector] must be initialized with `store_fselect_instance = TRUE` and `resample()` or `benchmark()` must be called with `store_models = TRUE`. #' #' @section Data structure: #' @@ -21,7 +19,7 @@ #' * `runtime_learners` (`numeric(1)`)\cr #' Sum of training and predict times logged in learners per #' [mlr3::ResampleResult] / evaluation. This does not include potential -#' overhead time. +#' overhead time. #' * `timestamp` (`POSIXct`)\cr #' Time stamp when the evaluation was logged into the archive. #' * `batch_nr` (`integer(1)`)\cr @@ -34,9 +32,6 @@ #' * `resampling_id` (`character(1)`). #' #' @param x ([mlr3::ResampleResult] | [mlr3::BenchmarkResult]). -#' @param unnest (`character()`)\cr -#' Transforms list columns to separate columns. Set to `NULL` if no column -#' should be unnested. #' @param exclude_columns (`character()`)\cr #' Exclude columns from result table. Set to `NULL` if no column should be #' excluded. @@ -44,29 +39,33 @@ #' #' @export #' @examples +#' # Nested Resampling on Palmer Penguins Data Set +#' +#' # create auto fselector #' at = auto_fselector( -#' method = "random_search", +#' method = fs("random_search"), #' learner = lrn("classif.rpart"), #' resampling = rsmp ("holdout"), #' measure = msr("classif.ce"), #' term_evals = 4) #' #' resampling_outer = rsmp("cv", folds = 2) -#' rr = resample(tsk("iris"), at, resampling_outer, store_models = TRUE) +#' rr = resample(tsk("penguins"), at, resampling_outer, store_models = TRUE) #' +#' # extract inner archives #' extract_inner_fselect_archives(rr) -extract_inner_fselect_archives = function (x, unnest = NULL, exclude_columns = "uhash") { +extract_inner_fselect_archives = function (x, exclude_columns = "uhash") { UseMethod("extract_inner_fselect_archives") } #' @export -extract_inner_fselect_archives.ResampleResult = function(x, unnest = NULL, exclude_columns = "uhash") { +extract_inner_fselect_archives.ResampleResult = function(x, exclude_columns = "uhash") { rr = assert_resample_result(x) 
if (is.null(rr$learners[[1]]$model$fselect_instance)) { return(data.table()) } tab = imap_dtr(rr$learners, function(learner, i) { - data = as.data.table(learner$archive, unnest, exclude_columns) + data = as.data.table(learner$archive, exclude_columns) set(data, j = "iteration", value = i) }) tab[, "task_id" := rr$task$id] @@ -79,10 +78,10 @@ extract_inner_fselect_archives.ResampleResult = function(x, unnest = NULL, exclu } #' @export -extract_inner_fselect_archives.BenchmarkResult = function(x, unnest = NULL, exclude_columns = "uhash") { +extract_inner_fselect_archives.BenchmarkResult = function(x, exclude_columns = "uhash") { bmr = assert_benchmark_result(x) tab = imap_dtr(bmr$resample_results$resample_result, function(rr, i) { - data = extract_inner_fselect_archives(rr, unnest, exclude_columns) + data = extract_inner_fselect_archives(rr, exclude_columns) if (nrow(data) > 0) set(data, j = "experiment", value = i) }, .fill = TRUE) @@ -93,4 +92,4 @@ extract_inner_fselect_archives.BenchmarkResult = function(x, unnest = NULL, excl setcolorder(tab, c("experiment", "iteration", cols_x, cols_y)) } tab -} \ No newline at end of file +} diff --git a/R/extract_inner_fselect_results.R b/R/extract_inner_fselect_results.R index ca27df8f..ca089a90 100644 --- a/R/extract_inner_fselect_results.R +++ b/R/extract_inner_fselect_results.R @@ -1,12 +1,13 @@ #' @title Extract Inner Feature Selection Results #' #' @description -#' Extract inner feature selection results of nested resampling. Implemented for -#' [mlr3::ResampleResult] and [mlr3::BenchmarkResult]. The function iterates -#' over the [AutoFSelector] objects and binds the feature selection results to a -#' [data.table::data.table()]. [AutoFSelector] must be initialized with -#' `store_fselect_instance = TRUE` and `resample()` or `benchmark()` must be -#' called with `store_models = TRUE`. +#' Extract inner feature selection results of nested resampling. +#' Implemented for [mlr3::ResampleResult] and [mlr3::BenchmarkResult]. 
+#' +#' @details +#' The function iterates over the [AutoFSelector] objects and binds the feature selection results to a [data.table::data.table()]. +#' [AutoFSelector] must be initialized with `store_fselect_instance = TRUE` and `resample()` or `benchmark()` must be called with `store_models = TRUE`. +#' Optionally, the instance can be added for each iteration. #' #' @section Data structure: #' @@ -25,12 +26,20 @@ #' * `resampling_id` (`character(1)`). #' #' @param x ([mlr3::ResampleResult] | [mlr3::BenchmarkResult]). +#' @param fselect_instance (`logical(1)`)\cr +#' If `TRUE`, instances are added to the table. +#' @param ... (any)\cr +#' Additional arguments. +#' #' @return [data.table::data.table()]. #' #' @export #' @examples +#' # Nested Resampling on Iris Data Set +#' +#' # create auto fselector #' at = auto_fselector( -#' method = "random_search", +#' method = fs("random_search"), #' learner = lrn("classif.rpart"), #' resampling = rsmp ("holdout"), #' measure = msr("classif.ce"), @@ -39,13 +48,14 @@ #' resampling_outer = rsmp("cv", folds = 2) #' rr = resample(tsk("iris"), at, resampling_outer, store_models = TRUE) #' +#' # extract inner results #' extract_inner_fselect_results(rr) -extract_inner_fselect_results = function (x) { +extract_inner_fselect_results = function (x, fselect_instance, ...) { UseMethod("extract_inner_fselect_results", x) } #' @export -extract_inner_fselect_results.ResampleResult = function(x) { +extract_inner_fselect_results.ResampleResult = function(x, fselect_instance = FALSE, ...) 
{ rr = assert_resample_result(x) if (is.null(rr$learners[[1]]$model$fselect_instance)) { return(data.table()) @@ -53,6 +63,8 @@ extract_inner_fselect_results.ResampleResult = function(x) { tab = imap_dtr(rr$learners, function(learner, i) { data = setalloccol(learner$fselect_result) set(data, j = "iteration", value = i) + if (fselect_instance) set(data, j = "fselect_instance", value = list(learner$fselect_instance)) + data }) tab[, "task_id" := rr$task$id] tab[, "learner_id" := rr$learner$id] @@ -64,10 +76,10 @@ extract_inner_fselect_results.ResampleResult = function(x) { } #' @export -extract_inner_fselect_results.BenchmarkResult = function(x) { +extract_inner_fselect_results.BenchmarkResult = function(x, fselect_instance = FALSE, ...) { bmr = assert_benchmark_result(x) tab = imap_dtr(bmr$resample_results$resample_result, function(rr, i) { - data = extract_inner_fselect_results(rr) + data = extract_inner_fselect_results(rr, fselect_instance = fselect_instance) if (nrow(data) > 0) set(data, j = "experiment", value = i) }, .fill = TRUE) # reorder dt @@ -77,4 +89,4 @@ extract_inner_fselect_results.BenchmarkResult = function(x) { setcolorder(tab, unique(c("experiment", "iteration", cols_x, cols_y))) } tab -} \ No newline at end of file +} diff --git a/R/fselect.R b/R/fselect.R index 6ee06d03..fa3bbd3b 100644 --- a/R/fselect.R +++ b/R/fselect.R @@ -1,10 +1,29 @@ #' @title Function for Feature Selection #' +#' @include FSelectInstanceSingleCrit.R ArchiveFSelect.R +#' #' @description -#' Function to optimize the feature set of a [mlr3::Learner]. +#' Function to optimize the features of a [mlr3::Learner]. +#' The function internally creates a [FSelectInstanceSingleCrit] or [FSelectInstanceMultiCrit] which describe the feature selection problem. +#' It executes the feature selection with the [FSelector] (`method`) and returns the result with the fselect instance (`$result`). 
+#' The [ArchiveFSelect] (`$archive`) stores all evaluated feature sets and performance scores. +#' +#' @details +#' The [mlr3::Task], [mlr3::Learner], [mlr3::Resampling], [mlr3::Measure] and [Terminator] are used to construct a [FSelectInstanceSingleCrit]. +#' If multiple performance [Measures][Measure] are supplied, a [FSelectInstanceMultiCrit] is created. +#' The parameters `term_evals` and `term_time` are shortcuts to create a [Terminator]. +#' If both parameters are passed, a [TerminatorCombo] is constructed. +#' For other [Terminators][Terminator], pass one with `terminator`. +#' If no termination criterion is needed, set `term_evals`, `term_time` and `terminator` to `NULL`. +#' +#' @inheritSection FSelectInstanceSingleCrit Resources +#' @inheritSection ArchiveFSelect Analysis #' #' @param method (`character(1)` | [FSelector])\cr #' Key to retrieve fselector from [mlr_fselectors] dictionary or [FSelector] object. +#' @param measures ([mlr3::Measure] or list of [mlr3::Measure])\cr +#' A single measure creates a [FSelectInstanceSingleCrit] and multiple measures a [FSelectInstanceMultiCrit]. +#' If `NULL`, default measure is used. #' @param term_evals (`integer(1)`)\cr #' Number of allowed evaluations. #' @param term_time (`integer(1)`)\cr @@ -12,39 +31,58 @@ #' @param ... (named `list()`)\cr #' Named arguments to be set as parameters of the fselector. 
#' -#' @return `FSelectInstanceSingleCrit` | `FSelectInstanceMultiCrit` +#' @return [FSelectInstanceSingleCrit] | [FSelectInstanceMultiCrit] #' #' @template param_task #' @template param_learner #' @template param_resampling -#' @template param_measures +#' @template param_terminator +#' @template param_store_benchmark_result #' @template param_store_models +#' @template param_check_values #' #' @export #' @examples +#' # Feature selection on the Pima Indians Diabetes data set #' task = tsk("pima") +#' learner = lrn("classif.rpart") #' +#' # Run feature selection #' instance = fselect( #' method = "random_search", #' task = task, -#' learner = lrn("classif.rpart"), +#' learner = learner, #' resampling = rsmp ("holdout"), #' measures = msr("classif.ce"), #' term_evals = 4) #' -#' # subset task to optimized feature set +#' # Subset task to optimized feature set #' task$select(instance$result_feature_set) -fselect = function(method, task, learner, resampling, measures, term_evals = NULL, term_time = NULL, store_models = FALSE, ...) { +#' +#' # Train the learner with optimal feature set on the full data set +#' learner$train(task) +#' +#' # Inspect all evaluated feature sets +#' as.data.table(instance$archive) +fselect = function(method, task, learner, resampling, measures = NULL, term_evals = NULL, term_time = NULL, terminator = NULL, store_benchmark_result = TRUE, store_models = FALSE, check_values = FALSE, ...) { fselector = if (is.character(method)) { assert_choice(method, mlr_fselectors$keys()) fs(method, ...) 
} else { assert_fselector(method) } - terminator = terminator_selection(term_evals, term_time) + terminator = terminator %??% terminator_selection(term_evals, term_time) FSelectInstance = if (!is.list(measures)) FSelectInstanceSingleCrit else FSelectInstanceMultiCrit - instance = FSelectInstance$new(task, learner, resampling, measures, terminator, store_models = store_models) + instance = FSelectInstance$new( + task = task, + learner = learner, + resampling = resampling, + measures, + terminator = terminator, + store_benchmark_result = store_benchmark_result, + store_models = store_models, + check_values = check_values) fselector$optimize(instance) instance diff --git a/R/fselect_nested.R b/R/fselect_nested.R index f5edbc3f..4d689917 100644 --- a/R/fselect_nested.R +++ b/R/fselect_nested.R @@ -24,19 +24,20 @@ #' #' @export #' @examples +#' # Nested resampling on Palmer Penguins data set #' rr = fselect_nested( #' method = "random_search", -#' task = tsk("pima"), +#' task = tsk("penguins"), #' learner = lrn("classif.rpart"), #' inner_resampling = rsmp ("holdout"), #' outer_resampling = rsmp("cv", folds = 2), #' measure = msr("classif.ce"), #' term_evals = 4) #' -#' # performance scores estimated on the outer resampling +#' # Performance scores estimated on the outer resampling #' rr$score() #' -#' # unbiased performance of the final model trained on the full data set +#' # Unbiased performance of the final model trained on the full data set #' rr$aggregate() fselect_nested = function(method, task, learner, inner_resampling, outer_resampling, measure, term_evals = NULL, term_time = NULL, ...) { @@ -46,4 +47,4 @@ fselect_nested = function(method, task, learner, inner_resampling, outer_resampl afs = auto_fselector(method, learner, inner_resampling, measure, term_evals, term_time, ...) 
resample(task, afs, outer_resampling, store_models = TRUE) -} \ No newline at end of file +} diff --git a/R/helper.R b/R/helper.R index bdd299a8..cb87618c 100644 --- a/R/helper.R +++ b/R/helper.R @@ -1,3 +1,9 @@ -catn = function(..., file = "") { - cat(paste0(..., collapse = "\n"), "\n", sep = "", file = file) +task_to_domain = function(task) { + ParamSet$new(map(task$feature_names, function(s) ParamLgl$new(id = s))) +} + +measures_to_codomain = function(measures) { + Codomain$new(map(as_measures(measures), function(s) { + ParamDbl$new(id = s$id, tags = ifelse(s$minimize, "minimize", "maximize")) + })) } diff --git a/R/sugar.R b/R/sugar.R index 4832282b..3eed14f1 100644 --- a/R/sugar.R +++ b/R/sugar.R @@ -1,14 +1,24 @@ #' @title Syntactic Sugar for FSelect Construction #' #' @description -#' This function complements [mlr_fselectors] with functions in the spirit -#' of [mlr3::mlr_sugar]. +#' Functions to retrieve objects, set parameters and assign to fields in one go. +#' Relies on [mlr3misc::dictionary_sugar_get()] to extract objects from the respective [mlr3misc::Dictionary]: +#' +#' * `fs()` for a [FSelector] from [mlr_fselectors]. +#' * `fss()` for a list of [FSelectors][FSelector] from [mlr_fselectors]. +#' * `trm()` for a [Terminator] from [mlr_terminators]. +#' * `trms()` for a list of [Terminators][Terminator] from [mlr_terminators]. #' #' @inheritParams mlr3::mlr_sugar -#' @return [FSelector]. +#' @return [R6::R6Class] object of the respective type, or a list of [R6::R6Class] objects for the plural versions. +#' #' @export #' @examples -#' fs("sequential", max_features = 4) +#' # random search with batch size of 5 +#' fs("random_search", batch_size = 5) +#' +#' # run time terminator with 20 seconds +#' trm("run_time", secs = 20) fs = function(.key, ...) { dictionary_sugar_get(mlr_fselectors, .key, ...) } @@ -18,3 +28,29 @@ fs = function(.key, ...) { fss = function(.keys, ...) { dictionary_sugar_mget(mlr_fselectors, .keys, ...) 
} + +#' @title Syntactic Sugar for Instance Construction +#' +#' @description +#' Function to construct a [FSelectInstanceSingleCrit] or [FSelectInstanceMultiCrit]. +#' +#' @param measures ([mlr3::Measure] or list of [mlr3::Measure])\cr +#' A single measure creates a [FSelectInstanceSingleCrit] and multiple measures a [FSelectInstanceMultiCrit]. +#' If `NULL`, default measure is used. +#' +#' @template param_task +#' @template param_learner +#' @template param_resampling +#' @template param_terminator +#' @template param_store_benchmark_result +#' @template param_store_models +#' @template param_check_values +#' +#' @inheritSection FSelectInstanceSingleCrit Resources +#' +#' @export +#' @inherit FSelectInstanceSingleCrit examples +fsi = function(task, learner, resampling, measures = NULL, terminator, store_benchmark_result = TRUE, store_models = FALSE, check_values = FALSE) { + FSelectInstance = if (!is.list(measures)) FSelectInstanceSingleCrit else FSelectInstanceMultiCrit + FSelectInstance$new(task, learner, resampling, measures, terminator, store_benchmark_result, store_models, check_values) +} diff --git a/man-roxygen/example.R b/man-roxygen/example.R index 810ce700..be96f193 100644 --- a/man-roxygen/example.R +++ b/man-roxygen/example.R @@ -1,12 +1,12 @@ #' @examples -#' # retrieve task -#' task = tsk("pima") +#' # Feature Selection +#' \donttest{ #' -#' # load learner +#' # retrieve task and load learner +#' task = tsk("penguins") #' learner = lrn("classif.rpart") #' -#' \donttest{ -#' # feature selection on the pima indians diabetes data set +#' # run feature selection on the Palmer Penguins data set #' instance = fselect( #' method = "<%= id %>", #' task = task, @@ -16,10 +16,10 @@ #' term_evals = 10 #' ) #' -#' # best performing feature subset +#' # best performing feature set #' instance$result #' -#' # all evaluated feature subsets +#' # all evaluated feature sets #' as.data.table(instance$archive) #' #' # subset the task and fit the final model diff 
--git a/man-roxygen/section_dictionary_fselectors.R b/man-roxygen/section_dictionary_fselectors.R index 92448353..384aa101 100644 --- a/man-roxygen/section_dictionary_fselectors.R +++ b/man-roxygen/section_dictionary_fselectors.R @@ -1,7 +1,5 @@ #' @section Dictionary: -#' This [FSelector] can be instantiated via the [dictionary][mlr3misc::Dictionary] -#' [mlr_fselectors] or with the associated sugar function [fs()]: +#' This [FSelector] can be instantiated with the associated sugar function [fs()]: #' ``` -#' mlr_fselectors$get("<%= id %>") #' fs("<%= id %>") #' ``` diff --git a/man/ArchiveFSelect.Rd b/man/ArchiveFSelect.Rd index 83e2b1ef..c2eaef82 100644 --- a/man/ArchiveFSelect.Rd +++ b/man/ArchiveFSelect.Rd @@ -2,10 +2,18 @@ % Please edit documentation in R/ArchiveFSelect.R \name{ArchiveFSelect} \alias{ArchiveFSelect} -\title{Logging Object for Evaluated Feature Sets} +\title{Class for Logging Evaluated Feature Sets} \description{ -Container around a \code{\link[data.table:data.table]{data.table::data.table()}} which stores all evaluated -feature sets and performance scores. +The \link{ArchiveFSelect} stores all evaluated feature sets and performance scores. +} +\details{ +The \link{ArchiveFSelect} is a container around a \code{\link[data.table:data.table]{data.table::data.table()}}. +Each row corresponds to a single evaluation of a feature set. +See the section on Data Structure for more information. +The archive additionally stores a \link[mlr3:BenchmarkResult]{mlr3::BenchmarkResult} (\verb{$benchmark_result}) that records the resampling experiments. +Each experiment corresponds to a single evaluation of a feature set. +The table (\verb{$data}) and the benchmark result (\verb{$benchmark_result}) are linked by the \code{uhash} column. +If the archive is passed to \code{as.data.table()}, both are joined automatically. 
} \section{Data structure}{ @@ -15,59 +23,40 @@ The table (\verb{$data}) has the following columns: \item One column for each feature of the task (\verb{$search_space}). \item One column for each performance measure (\verb{$codomain}). \item \code{runtime_learners} (\code{numeric(1)})\cr -Sum of training and predict times logged in learners per -\link[mlr3:ResampleResult]{mlr3::ResampleResult} / evaluation. This does not include potential -overhead time. +Sum of training and predict times logged in learners per \link[mlr3:ResampleResult]{mlr3::ResampleResult} / evaluation. +This does not include potential overhead time. \item \code{timestamp} (\code{POSIXct})\cr Time stamp when the evaluation was logged into the archive. \item \code{batch_nr} (\code{integer(1)})\cr -Feature sets are evaluated in batches. Each batch has a unique batch -number. +Feature sets are evaluated in batches. Each batch has a unique batch number. \item \code{uhash} (\code{character(1)})\cr -Connects each feature set to the resampling experiment -stored in the \link[mlr3:BenchmarkResult]{mlr3::BenchmarkResult}. +Connects each feature set to the resampling experiment stored in the \link[mlr3:BenchmarkResult]{mlr3::BenchmarkResult}. } - -Each row corresponds to a single evaluation of a feature set. - -The archive stores additionally a \link[mlr3:BenchmarkResult]{mlr3::BenchmarkResult} -(\verb{$benchmark_result}) that records the resampling experiments. Each -experiment corresponds to to a single evaluation of a feature set. The table -(\verb{$data}) and the benchmark result (\verb{$benchmark_result}) are linked by the -\code{uhash} column. If the results are viewed with \code{as.data.table()}, both are -joined automatically. } \section{Analysis}{ - -For analyzing the feature selection results, it is recommended to pass the archive to -\code{as.data.table()}. 
The returned data table is joined with the benchmark result -which adds the \link[mlr3:ResampleResult]{mlr3::ResampleResult} for each feature set. +For analyzing the feature selection results, it is recommended to pass the archive to \code{as.data.table()}. +The returned data table is joined with the benchmark result which adds the \link[mlr3:ResampleResult]{mlr3::ResampleResult} for each feature set. The archive provides various getters (e.g. \verb{$learners()}) to ease the access. -All getters extract by position (\code{i}) or unique hash (\code{uhash}). For a -complete list of all getters see the methods section. +All getters extract by position (\code{i}) or unique hash (\code{uhash}). +For a complete list of all getters see the methods section. -The benchmark result (\verb{$benchmark_result}) allows to score the feature sets -again on a different measure. Alternatively, measures can be supplied to -\code{as.data.table()}. +The benchmark result (\verb{$benchmark_result}) allows to score the feature sets again on a different measure. +Alternatively, measures can be supplied to \code{as.data.table()}. } \section{S3 Methods}{ \itemize{ -\item \code{as.data.table.ArchiveFSelect(x, unnest = NULL, exclude_columns = "uhash", measures = NULL)}\cr +\item \code{as.data.table.ArchiveFSelect(x, exclude_columns = "uhash", measures = NULL)}\cr Returns a tabular view of all evaluated feature sets.\cr \link{ArchiveFSelect} -> \code{\link[data.table:data.table]{data.table::data.table()}}\cr \itemize{ \item \code{x} (\link{ArchiveFSelect}) -\item \code{unnest} (\code{character()})\cr -Transforms list columns to separate columns. Set to \code{NULL} if no column -should be unnested. \item \code{exclude_columns} (\code{character()})\cr -Exclude columns from table. Set to \code{NULL} if no column should be -excluded. +Exclude columns from table. Set to \code{NULL} if no column should be excluded. 
\item \code{measures} (list of \link[mlr3:Measure]{mlr3::Measure})\cr Score feature sets on additional measures. } @@ -81,13 +70,14 @@ Score feature sets on additional measures. \if{html}{\out{
}} \describe{ \item{\code{benchmark_result}}{(\link[mlr3:BenchmarkResult]{mlr3::BenchmarkResult})\cr -Stores benchmark result.} +Benchmark result.} } \if{html}{\out{
}} } \section{Methods}{ \subsection{Public methods}{ \itemize{ +\item \href{#method-ArchiveFSelect-new}{\code{ArchiveFSelect$new()}} \item \href{#method-ArchiveFSelect-learner}{\code{ArchiveFSelect$learner()}} \item \href{#method-ArchiveFSelect-learners}{\code{ArchiveFSelect$learners()}} \item \href{#method-ArchiveFSelect-predictions}{\code{ArchiveFSelect$predictions()}} @@ -97,25 +87,49 @@ Stores benchmark result.} } } \if{html}{\out{ -
Inherited methods +
Inherited methods
}} \if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-ArchiveFSelect-new}{}}} +\subsection{Method \code{new()}}{ +Creates a new instance of this \link[R6:R6Class]{R6} class. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{ArchiveFSelect$new(search_space, codomain, check_values = TRUE)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
+\describe{ +\item{\code{search_space}}{(\link[paradox:ParamSet]{paradox::ParamSet})\cr +Search space. +Internally created from provided \link[mlr3:Task]{mlr3::Task} by instance.} + +\item{\code{codomain}}{(\link[bbotk:Codomain]{bbotk::Codomain})\cr +Specifies codomain of objective function i.e. a set of performance measures. +Internally created from provided \link[mlr3:Measure]{mlr3::Measure}s by instance.} + +\item{\code{check_values}}{(\code{logical(1)})\cr +If \code{TRUE} (default), hyperparameter configurations are checked for validity.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-ArchiveFSelect-learner}{}}} \subsection{Method \code{learner()}}{ -Retrieve \link[mlr3:Learner]{mlr3::Learner} of the i-th evaluation, by position -or by unique hash \code{uhash}. \code{i} and \code{uhash} are mutually exclusive. -Learner does not contain a model. Use \verb{$learners()} to get learners with -models. +Retrieve \link[mlr3:Learner]{mlr3::Learner} of the i-th evaluation, by position or by unique hash \code{uhash}. +\code{i} and \code{uhash} are mutually exclusive. +Learner does not contain a model. Use \verb{$learners()} to get learners with models. \subsection{Usage}{ \if{html}{\out{
}}\preformatted{ArchiveFSelect$learner(i = NULL, uhash = NULL)}\if{html}{\out{
}} } diff --git a/man/AutoFSelector.Rd b/man/AutoFSelector.Rd index ed1cf498..00c864ff 100644 --- a/man/AutoFSelector.Rd +++ b/man/AutoFSelector.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/AutoFSelector.R \name{AutoFSelector} \alias{AutoFSelector} -\title{AutoFSelector} +\title{Class for Automatic Feature Selection} \description{ The \link{AutoFSelector} wraps a \link[mlr3:Learner]{mlr3::Learner} and augments it with an automatic feature selection. The \code{\link[=auto_fselector]{auto_fselector()}} function creates an \link{AutoFSelector} object. @@ -33,12 +33,14 @@ For this reason it is not feasible to pass an instantiated \link[mlr3:Resampling } \examples{ -# Automafsic Feafsure Selection +# Automatic Feature Selection +\donttest{ +# split to train and external set task = tsk("penguins") -train_set = sample(task$nrow, 0.8 * task$nrow) -test_set = setdiff(seq_len(task$nrow), train_set) +split = partition(task, ratio = 0.8) +# create auto fselector afs = auto_fselector( method = fs("random_search"), learner = lrn("classif.rpart"), @@ -46,13 +48,13 @@ afs = auto_fselector( measure = msr("classif.ce"), term_evals = 4) -# optimize feafsure subset and fit final model -afs$train(task, row_ids = train_set) +# optimize feature subset and fit final model +afs$train(task, row_ids = split$train) # predict with final model -afs$predict(task, row_ids = test_set) +afs$predict(task, row_ids = split$test) -# show fselect result +# show result afs$fselect_result # model slot contains trained learner and fselect instance @@ -83,9 +85,10 @@ extract_inner_fselect_results(rr) # performance scores estimated on the outer resampling rr$score() -# unbiased performance of the final model trained on the full dafsa set +# unbiased performance of the final model trained on the full data set rr$aggregate() } +} \section{Super class}{ \code{\link[mlr3:Learner]{mlr3::Learner}} -> \code{AutoFSelector} } diff --git a/man/FSelectInstanceMultiCrit.Rd b/man/FSelectInstanceMultiCrit.Rd index 
084c8f83..6c390742 100644 --- a/man/FSelectInstanceMultiCrit.Rd +++ b/man/FSelectInstanceMultiCrit.Rd @@ -2,57 +2,56 @@ % Please edit documentation in R/FSelectInstanceMultiCrit.R \name{FSelectInstanceMultiCrit} \alias{FSelectInstanceMultiCrit} -\title{Multi Criterion Feature Selection Instance} +\title{Class for Multi Criteria Feature Selection} \description{ -Specifies a general feature selection scenario, including objective function -and archive for feature selection algorithms to act upon. This class stores -an \link{ObjectiveFSelect} object that encodes the black box objective function -which an \link{FSelector} has to optimize. It allows the basic operations of -querying the objective at feature subsets (\verb{$eval_batch()}), storing the -evaluations in the internal \link[bbotk:Archive]{bbotk::Archive} and accessing the final result -(\verb{$result}). - -Evaluations of feature subsets are performed in batches by calling -\code{\link[mlr3:benchmark]{mlr3::benchmark()}} internally. Before a batch is evaluated, the -\link[bbotk:Terminator]{bbotk::Terminator} is queried for the remaining budget. If the available -budget is exhausted, an exception is raised, and no further evaluations can -be performed from this point on. - -The \link{FSelector} is also supposed to store its final result, consisting -of the selected feature subsets and associated estimated performance values, by -calling the method \code{instance$assign_result()}. +The \link{FSelectInstanceMultiCrit} specifies a feature selection problem for \link[=FSelector]{FSelectors}. +The function \code{\link[=fsi]{fsi()}} creates a \link{FSelectInstanceMultiCrit} and the function \code{\link[=fselect]{fselect()}} creates an instance internally. +} +\section{Resources}{ + +\itemize{ +\item \href{https://mlr3book.mlr-org.com/feature-selection.html#fs-wrapper}{book chapter} on feature selection. 
+\item \href{https://mlr-org.com/gallery/2020-09-14-mlr3fselect-basic/}{gallery post} on feature selection on the Titanic data set. +} } -\examples{ -library(mlr3) -library(data.table) -# Objects required to define the performance evaluator -task = tsk("iris") -measures = msrs(c("classif.ce", "classif.acc")) -learner = lrn("classif.rpart") -resampling = rsmp("cv") -terminator = trm("evals", n_evals = 8) +\section{Analysis}{ -inst = FSelectInstanceMultiCrit$new( +For analyzing the feature selection results, it is recommended to pass the archive to \code{as.data.table()}. +The returned data table is joined with the benchmark result which adds the \link[mlr3:ResampleResult]{mlr3::ResampleResult} for each feature set. + +The archive provides various getters (e.g. \verb{$learners()}) to ease the access. +All getters extract by position (\code{i}) or unique hash (\code{uhash}). +For a complete list of all getters see the methods section. + +The benchmark result (\verb{$benchmark_result}) allows to score the feature sets again on a different measure. +Alternatively, measures can be supplied to \code{as.data.table()}. 
+} + +\examples{ +# Feature selection on Palmer Penguins data set +task = tsk("penguins") + +# Construct feature selection instance +instance = fsi( task = task, - learner = learner, - resampling = resampling, - measures = measures, - terminator = terminator + learner = lrn("classif.rpart"), + resampling = rsmp("cv", folds = 3), + measures = msrs(c("classif.ce", "time_train")), + terminator = trm("evals", n_evals = 4) ) -# Try some feature subsets -xdt = data.table( - Petal.Length = c(TRUE, FALSE), - Petal.Width = c(FALSE, TRUE), - Sepal.Length = c(TRUE, FALSE), - Sepal.Width = c(FALSE, TRUE) -) +# Choose optimization algorithm +fselector = fs("random_search", batch_size = 2) + +# Run feature selection +fselector$optimize(instance) -inst$eval_batch(xdt) +# Optimal feature sets +instance$result_feature_set -# Get archive data -as.data.table(inst$archive) +# Inspect all evaluated sets +as.data.table(instance$archive) } \section{Super classes}{ \code{\link[bbotk:OptimInstance]{bbotk::OptimInstance}} -> \code{\link[bbotk:OptimInstanceMultiCrit]{bbotk::OptimInstanceMultiCrit}} -> \code{FSelectInstanceMultiCrit} @@ -60,7 +59,7 @@ as.data.table(inst$archive) \section{Active bindings}{ \if{html}{\out{
}} \describe{ -\item{\code{result_feature_set}}{(\code{list()} of \code{character()})\cr +\item{\code{result_feature_set}}{(list of \code{character()})\cr Feature sets for task subsetting.} } \if{html}{\out{
}} @@ -96,9 +95,9 @@ Creates a new instance of this \link[R6:R6Class]{R6} class. resampling, measures, terminator, + store_benchmark_result = TRUE, store_models = FALSE, - check_values = TRUE, - store_benchmark_result = TRUE + check_values = FALSE )}\if{html}{\out{}} } @@ -123,15 +122,15 @@ If \code{NULL}, \CRANpkg{mlr3}'s default measure is used.} \item{\code{terminator}}{(\link{Terminator})\cr Stop criterion of the feature selection.} +\item{\code{store_benchmark_result}}{(\code{logical(1)})\cr +Store benchmark result in archive?} + \item{\code{store_models}}{(\code{logical(1)}). Store models in benchmark result?} \item{\code{check_values}}{(\code{logical(1)})\cr Check the parameters before the evaluation and the results for validity?} - -\item{\code{store_benchmark_result}}{(\code{logical(1)})\cr -Store benchmark result in archive?} } \if{html}{\out{}} } @@ -140,8 +139,8 @@ Store benchmark result in archive?} \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-FSelectInstanceMultiCrit-assign_result}{}}} \subsection{Method \code{assign_result()}}{ -The \link{FSelector} object writes the best found feature subsets -and estimated performance values here. For internal use. +The \link{FSelector} object writes the best found feature subsets and estimated performance values here. +For internal use. \subsection{Usage}{ \if{html}{\out{
}}\preformatted{FSelectInstanceMultiCrit$assign_result(xdt, ydt)}\if{html}{\out{
}} } diff --git a/man/FSelectInstanceSingleCrit.Rd b/man/FSelectInstanceSingleCrit.Rd index eac4953c..f97187fe 100644 --- a/man/FSelectInstanceSingleCrit.Rd +++ b/man/FSelectInstanceSingleCrit.Rd @@ -2,58 +2,69 @@ % Please edit documentation in R/FSelectInstanceSingleCrit.R \name{FSelectInstanceSingleCrit} \alias{FSelectInstanceSingleCrit} -\title{Single Criterion Feature Selection Instance} +\title{Class for Single Criterion Feature Selection} \description{ -Specifies a general feature selection scenario, including objective function -and archive for feature selection algorithms to act upon. This class stores -an \link{ObjectiveFSelect} object that encodes the black box objective function -which an \link{FSelector} has to optimize. It allows the basic operations of -querying the objective at feature subsets (\verb{$eval_batch()}), storing the -evaluations in the internal \link[bbotk:Archive]{bbotk::Archive} and accessing the final result -(\verb{$result}). - -Evaluations of feature subsets are performed in batches by calling -\code{\link[mlr3:benchmark]{mlr3::benchmark()}} internally. Before a batch is evaluated, the -\link[bbotk:Terminator]{bbotk::Terminator} is queried for the remaining budget. If the available -budget is exhausted, an exception is raised, and no further evaluations can -be performed from this point on. - -The \link{FSelector} is also supposed to store its final result, consisting -of a selected feature subset and associated estimated performance values, by -calling the method \code{instance$assign_result()}. +The \link{FSelectInstanceSingleCrit} specifies a feature selection problem for \link[=FSelector]{FSelectors}. +The function \code{\link[=fsi]{fsi()}} creates a \link{FSelectInstanceSingleCrit} and the function \code{\link[=fselect]{fselect()}} creates an instance internally. + +The instance contains an \link{ObjectiveFSelect} object that encodes the black box objective function a \link{FSelector} has to optimize. 
+The instance allows the basic operations of querying the objective at design points (\verb{$eval_batch()}). +This operation is usually done by the \link{FSelector}. +Evaluations of feature subsets are performed in batches by calling \code{\link[mlr3:benchmark]{mlr3::benchmark()}} internally. +The evaluated feature subsets are stored in the \link[=ArchiveFSelect]{Archive} (\verb{$archive}). +Before a batch is evaluated, the \link[bbotk:Terminator]{bbotk::Terminator} is queried for the remaining budget. +If the available budget is exhausted, an exception is raised, and no further evaluations can be performed from this point on. +The \link{FSelector} is also supposed to store its final result, consisting of a selected feature subset and associated estimated performance values, by calling the method \code{instance$assign_result()}. +} +\section{Resources}{ + +\itemize{ +\item \href{https://mlr3book.mlr-org.com/feature-selection.html#fs-wrapper}{book chapter} on feature selection. +\item \href{https://mlr-org.com/gallery/2020-09-14-mlr3fselect-basic/}{gallery post} on feature selection on the Titanic data set. +} +} + +\section{Analysis}{ + +For analyzing the feature selection results, it is recommended to pass the archive to \code{as.data.table()}. +The returned data table is joined with the benchmark result which adds the \link[mlr3:ResampleResult]{mlr3::ResampleResult} for each feature set. + +The archive provides various getters (e.g. \verb{$learners()}) to ease the access. +All getters extract by position (\code{i}) or unique hash (\code{uhash}). +For a complete list of all getters see the methods section. + +The benchmark result (\verb{$benchmark_result}) allows to score the feature sets again on a different measure. +Alternatively, measures can be supplied to \code{as.data.table()}. 
} -\examples{ -library(mlr3) -library(data.table) -# Objects required to define the objective function -task = tsk("iris") -measure = msr("classif.ce") +\examples{ +# Feature selection on Palmer Penguins data set +task = tsk("penguins") learner = lrn("classif.rpart") -resampling = rsmp("cv") -# Create instance -terminator = trm("evals", n_evals = 8) -inst = FSelectInstanceSingleCrit$new( +# Construct feature selection instance +instance = fsi( task = task, learner = learner, - resampling = resampling, - measure = measure, - terminator = terminator + resampling = rsmp("cv", folds = 3), + measures = msr("classif.ce"), + terminator = trm("evals", n_evals = 4) ) -# Try some feature subsets -xdt = data.table( - Petal.Length = c(TRUE, FALSE), - Petal.Width = c(FALSE, TRUE), - Sepal.Length = c(TRUE, FALSE), - Sepal.Width = c(FALSE, TRUE) -) +# Choose optimization algorithm +fselector = fs("random_search", batch_size = 2) + +# Run feature selection +fselector$optimize(instance) -inst$eval_batch(xdt) +# Subset task to optimal feature set +task$select(instance$result_feature_set) -# Get archive data -as.data.table(inst$archive) +# Train the learner with optimal feature set on the full data set +learner$train(task) + +# Inspect all evaluated sets +as.data.table(instance$archive) } \section{Super classes}{ \code{\link[bbotk:OptimInstance]{bbotk::OptimInstance}} -> \code{\link[bbotk:OptimInstanceSingleCrit]{bbotk::OptimInstanceSingleCrit}} -> \code{FSelectInstanceSingleCrit} @@ -97,9 +108,9 @@ Creates a new instance of this \link[R6:R6Class]{R6} class. resampling, measure, terminator, + store_benchmark_result = TRUE, store_models = FALSE, - check_values = TRUE, - store_benchmark_result = TRUE + check_values = FALSE )}\if{html}{\out{}} } @@ -123,15 +134,15 @@ Measure to optimize. 
If \code{NULL}, default measure is used.} \item{\code{terminator}}{(\link{Terminator})\cr Stop criterion of the feature selection.} +\item{\code{store_benchmark_result}}{(\code{logical(1)})\cr +Store benchmark result in archive?} + \item{\code{store_models}}{(\code{logical(1)}). Store models in benchmark result?} \item{\code{check_values}}{(\code{logical(1)})\cr Check the parameters before the evaluation and the results for validity?} - -\item{\code{store_benchmark_result}}{(\code{logical(1)})\cr -Store benchmark result in archive?} } \if{html}{\out{}} } @@ -140,8 +151,8 @@ Store benchmark result in archive?} \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-FSelectInstanceSingleCrit-assign_result}{}}} \subsection{Method \code{assign_result()}}{ -The \link{FSelector} writes the best found feature subset -and estimated performance value here. For internal use. +The \link{FSelector} writes the best found feature subset and estimated performance value here. +For internal use. \subsection{Usage}{ \if{html}{\out{
}}\preformatted{FSelectInstanceSingleCrit$assign_result(xdt, y)}\if{html}{\out{
} diff --git a/man/FSelector.Rd b/man/FSelector.Rd index c6c1948e..a221a486 100644 --- a/man/FSelector.Rd +++ b/man/FSelector.Rd @@ -2,58 +2,43 @@ % Please edit documentation in R/FSelector.R \name{FSelector} \alias{FSelector} -\title{FSelector} +\title{Class for Feature Selection Algorithms} \description{ -Abstract \code{FSelector} class that implements the base functionality each -fselector must provide. A \code{FSelector} object describes the feature selection -strategy, i.e. how to optimize the black-box function and its feasible set -defined by the \link{FSelectInstanceSingleCrit} / \link{FSelectInstanceMultiCrit} object. - -A fselector must write its result into the \link{FSelectInstanceSingleCrit} / -\link{FSelectInstanceMultiCrit} using the \code{assign_result} method of the -\link[bbotk:OptimInstance]{bbotk::OptimInstance} at the end of its selection in order to store the best -selected feature subset and its estimated performance vector. +The \link{FSelector} implements the optimization algorithm. +} +\details{ +\link{FSelector} is an abstract base class that implements the base functionality each fselector must provide. +A subclass is implemented in the following way: +\itemize{ +\item Inherit from FSelector. +\item Specify the private abstract method \verb{$.optimize()} and use it to call into your optimizer. +\item You need to call \code{instance$eval_batch()} to evaluate design points. +\item The batch evaluation is requested at the \link{FSelectInstanceSingleCrit}/\link{FSelectInstanceMultiCrit} object \code{instance}, so each batch is possibly executed in parallel via \code{\link[mlr3:benchmark]{mlr3::benchmark()}}, and all evaluations are stored inside of \code{instance$archive}. +\item Before the batch evaluation, the \link[bbotk:Terminator]{bbotk::Terminator} is checked, and if it is positive, an exception of class \code{"terminated_error"} is generated. 
+In the latter case the current batch of evaluations is still stored in \code{instance}, but the numeric scores are not sent back to the handling optimizer as it has lost execution control. +\item After such an exception was caught we select the best set from \code{instance$archive} and return it. +\item Note that therefore more points than specified by the \link[bbotk:Terminator]{bbotk::Terminator} may be evaluated, as the Terminator is only checked before a batch evaluation, and not in-between evaluations in a batch. +How many more depends on the setting of the batch size. +\item Overwrite the private super-method \code{.assign_result()} if you want to decide yourself how to estimate the final set in the instance and its estimated performance. +The default behavior is: We pick the best resample-experiment, regarding the given measure, then assign its set and aggregated performance to the instance. +} } \section{Private Methods}{ \itemize{ \item \code{.optimize(instance)} -> \code{NULL}\cr -Abstract base method. Implement to specify feature selection of your -subclass. See technical details sections. +Abstract base method. Implement to specify feature selection of your subclass. +See the details section. \item \code{.assign_result(instance)} -> \code{NULL}\cr -Abstract base method. Implement to specify how the final feature subset is -selected. See technical details sections. +Abstract base method. Implement to specify how the final feature subset is selected. +See the details section. } } -\section{Technical Details and Subclasses}{ +\section{Resources}{ -A subclass is implemented in the following way: \itemize{ -\item Inherit from \code{FSelector}. -\item Specify the private abstract method \verb{$.optimize()} and use it to call into -your optimizer. -\item You need to call \code{instance$eval_batch()} to evaluate feature subsets. 
-\item The batch evaluation is requested at the \link{FSelectInstanceSingleCrit} / -\link{FSelectInstanceMultiCrit} object \code{instance}, so each batch is possibly -executed in parallel via \code{\link[mlr3:benchmark]{mlr3::benchmark()}}, and all evaluations are stored -inside of \code{instance$archive}. -\item Before the batch evaluation, the \link[bbotk:Terminator]{bbotk::Terminator} is checked, and if it is -positive, an exception of class \code{"terminated_error"} is generated. In the -later case the current batch of evaluations is still stored in \code{instance}, -but the numeric scores are not sent back to the handling optimizer as it has -lost execution control. -\item After such an exception was caught we select the best feature subset from -\code{instance$archive} and return it. -\item Note that therefore more points than specified by the \link[bbotk:Terminator]{bbotk::Terminator} -may be evaluated, as the Terminator is only checked before a batch -evaluation, and not in-between evaluation in a batch. How many more depends -on the setting of the batch size. -\item Overwrite the private super-method \code{.assign_result()} if you want to decide -yourself how to estimate the final feature subset in the instance and its -estimated performance. The default behavior is: We pick the best -resample-experiment, regarding the given measure, then assign its -feature subset and aggregated performance to the instance. +\item \href{https://mlr3book.mlr-org.com/feature-selection.html#the-fselector-class}{book section} on feature selection algorithms. } } @@ -185,10 +170,8 @@ Opens the corresponding help page referenced by field \verb{$man}. \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-FSelector-optimize}{}}} \subsection{Method \code{optimize()}}{ -Performs the feature selection on a \link{FSelectInstanceSingleCrit} or -\link{FSelectInstanceMultiCrit} until termination. 
-The single evaluations will be written into the \link{ArchiveFSelect} that resides in the -\link{FSelectInstanceSingleCrit} / \link{FSelectInstanceMultiCrit}. +Performs the feature selection on a \link{FSelectInstanceSingleCrit} or \link{FSelectInstanceMultiCrit} until termination. +The single evaluations will be written into the \link{ArchiveFSelect} that resides in the \link{FSelectInstanceSingleCrit} / \link{FSelectInstanceMultiCrit}. The result will be written into the instance object. \subsection{Usage}{ \if{html}{\out{
}}\preformatted{FSelector$optimize(inst)}\if{html}{\out{
}} @@ -197,12 +180,12 @@ The result will be written into the instance object. \subsection{Arguments}{ \if{html}{\out{
}} \describe{ -\item{\code{inst}}{(\link{FSelectInstanceSingleCrit}|\link{FSelectInstanceMultiCrit}).} +\item{\code{inst}}{(\link{FSelectInstanceSingleCrit} | \link{FSelectInstanceMultiCrit}).} } \if{html}{\out{
}} } \subsection{Returns}{ -\link[data.table:data.table]{data.table::data.table}. +\code{\link[data.table:data.table]{data.table::data.table()}}. } } \if{html}{\out{
}} diff --git a/man/ObjectiveFSelect.Rd b/man/ObjectiveFSelect.Rd index f3a82f5e..8a456fec 100644 --- a/man/ObjectiveFSelect.Rd +++ b/man/ObjectiveFSelect.Rd @@ -2,11 +2,10 @@ % Please edit documentation in R/ObjectiveFSelect.R \name{ObjectiveFSelect} \alias{ObjectiveFSelect} -\title{ObjectiveFSelect} +\title{Class for Feature Selection Objective} \description{ -Stores the objective function that estimates the performance of feature -subsets. This class is usually constructed internally by by the -\link{FSelectInstanceSingleCrit} / \link{FSelectInstanceMultiCrit}. +Stores the objective function that estimates the performance of feature subsets. +This class is usually constructed internally by the \link{FSelectInstanceSingleCrit} / \link{FSelectInstanceMultiCrit}. } \section{Super class}{ \code{\link[bbotk:Objective]{bbotk::Objective}} -> \code{ObjectiveFSelect} @@ -14,13 +13,13 @@ subsets. This class is usually constructed internally by by the \section{Public fields}{ \if{html}{\out{
}} \describe{ -\item{\code{task}}{(\link[mlr3:Task]{mlr3::Task})} +\item{\code{task}}{(\link[mlr3:Task]{mlr3::Task}).} -\item{\code{learner}}{(\link[mlr3:Learner]{mlr3::Learner})} +\item{\code{learner}}{(\link[mlr3:Learner]{mlr3::Learner}).} -\item{\code{resampling}}{(\link[mlr3:Resampling]{mlr3::Resampling})} +\item{\code{resampling}}{(\link[mlr3:Resampling]{mlr3::Resampling}).} -\item{\code{measures}}{(list of \link[mlr3:Measure]{mlr3::Measure})} +\item{\code{measures}}{(list of \link[mlr3:Measure]{mlr3::Measure}).} \item{\code{store_models}}{(\code{logical(1)}).} @@ -52,9 +51,6 @@ subsets. This class is usually constructed internally by by the \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-ObjectiveFSelect-new}{}}} \subsection{Method \code{new()}}{ -Creates a new instance of this \link[R6:R6Class]{R6} class. - - Creates a new instance of this \link[R6:R6Class]{R6} class. \subsection{Usage}{ \if{html}{\out{
}}\preformatted{ObjectiveFSelect$new( @@ -64,7 +60,8 @@ Creates a new instance of this \link[R6:R6Class]{R6} class. measures, check_values = TRUE, store_benchmark_result = TRUE, - store_models = FALSE + store_models = FALSE, + archive = NULL )}\if{html}{\out{
}} } @@ -95,6 +92,10 @@ Store benchmark result in archive?} \item{\code{store_models}}{(\code{logical(1)}). Store models in benchmark result?} + +\item{\code{archive}}{(\link{ArchiveFSelect})\cr +Reference to the archive of \link{FSelectInstanceSingleCrit} | \link{FSelectInstanceMultiCrit}. +If \code{NULL} (default), benchmark result and models cannot be stored.} } \if{html}{\out{
}} } diff --git a/man/auto_fselector.Rd b/man/auto_fselector.Rd index b9f1be61..2ae41534 100644 --- a/man/auto_fselector.Rd +++ b/man/auto_fselector.Rd @@ -93,12 +93,59 @@ For this reason it is not feasible to pass an instantiated \link[mlr3:Resampling } \examples{ -at = auto_fselector( - method = "random_search", +# Automatic Feature Selection +\donttest{ + +# split to train and external set +task = tsk("penguins") +split = partition(task, ratio = 0.8) + +# create auto fselector +afs = auto_fselector( + method = fs("random_search"), learner = lrn("classif.rpart"), resampling = rsmp ("holdout"), measure = msr("classif.ce"), term_evals = 4) -at$train(tsk("pima")) +# optimize feature subset and fit final model +afs$train(task, row_ids = split$train) + +# predict with final model +afs$predict(task, row_ids = split$test) + +# show result +afs$fselect_result + +# model slot contains trained learner and fselect instance +afs$model + +# shortcut trained learner +afs$learner + +# shortcut fselect instance +afs$fselect_instance + + +# Nested Resampling + +afs = auto_fselector( + method = fs("random_search"), + learner = lrn("classif.rpart"), + resampling = rsmp ("holdout"), + measure = msr("classif.ce"), + term_evals = 4) + +resampling_outer = rsmp("cv", folds = 3) +rr = resample(task, afs, resampling_outer, store_models = TRUE) + +# retrieve inner feature selection results. 
+extract_inner_fselect_results(rr) + +# performance scores estimated on the outer resampling +rr$score() + +# unbiased performance of the final model trained on the full data set +rr$aggregate() +} } diff --git a/man/extract_inner_fselect_archives.Rd b/man/extract_inner_fselect_archives.Rd index b3a52ef1..c4738479 100644 --- a/man/extract_inner_fselect_archives.Rd +++ b/man/extract_inner_fselect_archives.Rd @@ -4,15 +4,11 @@ \alias{extract_inner_fselect_archives} \title{Extract Inner Feature Selection Archives} \usage{ -extract_inner_fselect_archives(x, unnest = NULL, exclude_columns = "uhash") +extract_inner_fselect_archives(x, exclude_columns = "uhash") } \arguments{ \item{x}{(\link[mlr3:ResampleResult]{mlr3::ResampleResult} | \link[mlr3:BenchmarkResult]{mlr3::BenchmarkResult}).} -\item{unnest}{(\code{character()})\cr -Transforms list columns to separate columns. Set to \code{NULL} if no column -should be unnested.} - \item{exclude_columns}{(\code{character()})\cr Exclude columns from result table. Set to \code{NULL} if no column should be excluded.} @@ -21,12 +17,10 @@ excluded.} \code{\link[data.table:data.table]{data.table::data.table()}}. } \description{ -Extract inner feature selection archives of nested resampling. Implemented for -\link[mlr3:ResampleResult]{mlr3::ResampleResult} and \link[mlr3:BenchmarkResult]{mlr3::BenchmarkResult}. The function iterates -over the \link{AutoFSelector} objects and binds the archives to a -\code{\link[data.table:data.table]{data.table::data.table()}}. \link{AutoFSelector} must be initialized with -\code{store_fselect_instance = TRUE} and \code{resample()} or \code{benchmark()} must be -called with \code{store_models = TRUE}. +Extract inner feature selection archives of nested resampling. +Implemented for \link[mlr3:ResampleResult]{mlr3::ResampleResult} and \link[mlr3:BenchmarkResult]{mlr3::BenchmarkResult}. 
+The function iterates over the \link{AutoFSelector} objects and binds the archives to a \code{\link[data.table:data.table]{data.table::data.table()}}. +\link{AutoFSelector} must be initialized with \code{store_fselect_instance = TRUE} and \code{resample()} or \code{benchmark()} must be called with \code{store_models = TRUE}. } \section{Data structure}{ @@ -57,15 +51,19 @@ Resample result of the inner resampling. } \examples{ +# Nested Resampling on Palmer Penguins Data Set + +# create auto fselector at = auto_fselector( - method = "random_search", + method = fs("random_search"), learner = lrn("classif.rpart"), resampling = rsmp ("holdout"), measure = msr("classif.ce"), term_evals = 4) resampling_outer = rsmp("cv", folds = 2) -rr = resample(tsk("iris"), at, resampling_outer, store_models = TRUE) +rr = resample(tsk("penguins"), at, resampling_outer, store_models = TRUE) +# extract inner archives extract_inner_fselect_archives(rr) } diff --git a/man/extract_inner_fselect_results.Rd b/man/extract_inner_fselect_results.Rd index 0909ce55..e6d2ee19 100644 --- a/man/extract_inner_fselect_results.Rd +++ b/man/extract_inner_fselect_results.Rd @@ -4,21 +4,28 @@ \alias{extract_inner_fselect_results} \title{Extract Inner Feature Selection Results} \usage{ -extract_inner_fselect_results(x) +extract_inner_fselect_results(x, fselect_instance, ...) } \arguments{ \item{x}{(\link[mlr3:ResampleResult]{mlr3::ResampleResult} | \link[mlr3:BenchmarkResult]{mlr3::BenchmarkResult}).} + +\item{fselect_instance}{(\code{logical(1)})\cr +If \code{TRUE}, instances are added to the table.} + +\item{...}{(any)\cr +Additional arguments.} } \value{ \code{\link[data.table:data.table]{data.table::data.table()}}. } \description{ -Extract inner feature selection results of nested resampling. Implemented for -\link[mlr3:ResampleResult]{mlr3::ResampleResult} and \link[mlr3:BenchmarkResult]{mlr3::BenchmarkResult}. 
The function iterates -over the \link{AutoFSelector} objects and binds the feature selection results to a -\code{\link[data.table:data.table]{data.table::data.table()}}. \link{AutoFSelector} must be initialized with -\code{store_fselect_instance = TRUE} and \code{resample()} or \code{benchmark()} must be -called with \code{store_models = TRUE}. +Extract inner feature selection results of nested resampling. +Implemented for \link[mlr3:ResampleResult]{mlr3::ResampleResult} and \link[mlr3:BenchmarkResult]{mlr3::BenchmarkResult}. +} +\details{ +The function iterates over the \link{AutoFSelector} objects and binds the feature selection results to a \code{\link[data.table:data.table]{data.table::data.table()}}. +\link{AutoFSelector} must be initialized with \code{store_fselect_instance = TRUE} and \code{resample()} or \code{benchmark()} must be called with \code{store_models = TRUE}. +Optionally, the instance can be added for each iteration. } \section{Data structure}{ @@ -40,8 +47,11 @@ Vector of selected feature set. } \examples{ +# Nested Resampling on Palmer Penguins Data Set + +# create auto fselector at = auto_fselector( - method = "random_search", + method = fs("random_search"), learner = lrn("classif.rpart"), resampling = rsmp ("holdout"), measure = msr("classif.ce"), @@ -50,5 +60,6 @@ at = auto_fselector( resampling_outer = rsmp("cv", folds = 2) rr = resample(tsk("iris"), at, resampling_outer, store_models = TRUE) +# extract inner results extract_inner_fselect_results(rr) } diff --git a/man/fs.Rd b/man/fs.Rd index 5bf5e012..be73f51e 100644 --- a/man/fs.Rd +++ b/man/fs.Rd @@ -21,12 +21,22 @@ See \code{\link[mlr3misc:dictionary_sugar_get]{mlr3misc::dictionary_sugar_get()} Keys passed to the respective \link[mlr3misc:Dictionary]{dictionary} to retrieve multiple objects.} } \value{ -\link{FSelector}. +\link[R6:R6Class]{R6::R6Class} object of the respective type, or a list of \link[R6:R6Class]{R6::R6Class} objects for the plural versions. 
} \description{ -This function complements \link{mlr_fselectors} with functions in the spirit -of \link[mlr3:mlr_sugar]{mlr3::mlr_sugar}. +Functions to retrieve objects, set parameters and assign to fields in one go. +Relies on \code{\link[mlr3misc:dictionary_sugar_get]{mlr3misc::dictionary_sugar_get()}} to extract objects from the respective \link[mlr3misc:Dictionary]{mlr3misc::Dictionary}: +\itemize{ +\item \code{fs()} for a \link{FSelector} from \link{mlr_fselectors}. +\item \code{fss()} for a list of \link[=FSelector]{FSelectors} from \link{mlr_fselectors}. +\item \code{trm()} for a \link{Terminator} from \link{mlr_terminators}. +\item \code{trms()} for a list of \link[=Terminator]{Terminators} from \link{mlr_terminators}. +} } \examples{ -fs("sequential", max_features = 4) +# random search with batch size of 5 +fs("random_search", batch_size = 5) + +# run time terminator with 20 seconds +trm("run_time", secs = 20) } diff --git a/man/fselect.Rd b/man/fselect.Rd index f76b9142..0795ef6c 100644 --- a/man/fselect.Rd +++ b/man/fselect.Rd @@ -9,10 +9,13 @@ fselect( task, learner, resampling, - measures, + measures = NULL, term_evals = NULL, term_time = NULL, + terminator = NULL, + store_benchmark_result = TRUE, store_models = FALSE, + check_values = FALSE, ... ) } @@ -31,9 +34,9 @@ Resampling that is used to evaluated the performance of the feature subsets. Uninstantiated resamplings are instantiated during construction so that all feature subsets are evaluated on the same data splits. Already instantiated resamplings are kept unchanged.} -\item{measures}{(list of \link[mlr3:Measure]{mlr3::Measure})\cr -Measures to optimize. -If \code{NULL}, \CRANpkg{mlr3}'s default measure is used.} +\item{measures}{(\link[mlr3:Measure]{mlr3::Measure} or list of \link[mlr3:Measure]{mlr3::Measure})\cr +A single measure creates a \link{FSelectInstanceSingleCrit} and multiple measures a \link{FSelectInstanceMultiCrit}. 
+If \code{NULL}, default measure is used.} \item{term_evals}{(\code{integer(1)})\cr Number of allowed evaluations.} @@ -41,29 +44,80 @@ Number of allowed evaluations.} \item{term_time}{(\code{integer(1)})\cr Maximum allowed time in seconds.} +\item{terminator}{(\link{Terminator})\cr +Stop criterion of the feature selection.} + +\item{store_benchmark_result}{(\code{logical(1)})\cr +Store benchmark result in archive?} + \item{store_models}{(\code{logical(1)}). Store models in benchmark result?} +\item{check_values}{(\code{logical(1)})\cr +Check the parameters before the evaluation and the results for +validity?} + \item{...}{(named \code{list()})\cr Named arguments to be set as parameters of the fselector.} } \value{ -\code{FSelectInstanceSingleCrit} | \code{FSelectInstanceMultiCrit} +\link{FSelectInstanceSingleCrit} | \link{FSelectInstanceMultiCrit} } \description{ -Function to optimize the feature set of a \link[mlr3:Learner]{mlr3::Learner}. +Function to optimize the features of a \link[mlr3:Learner]{mlr3::Learner}. +The function internally creates a \link{FSelectInstanceSingleCrit} or \link{FSelectInstanceMultiCrit} which describes the feature selection problem. +It executes the feature selection with the \link{FSelector} (\code{method}) and returns the result with the fselect instance (\verb{$result}). +The \link{ArchiveFSelect} (\verb{$archive}) stores all evaluated feature sets and performance scores. +} +\details{ +The \link[mlr3:Task]{mlr3::Task}, \link[mlr3:Learner]{mlr3::Learner}, \link[mlr3:Resampling]{mlr3::Resampling}, \link[mlr3:Measure]{mlr3::Measure} and \link{Terminator} are used to construct a \link{FSelectInstanceSingleCrit}. +If multiple performance \link[=Measure]{Measures} are supplied, a \link{FSelectInstanceMultiCrit} is created. +The parameters \code{term_evals} and \code{term_time} are shortcuts to create a \link{Terminator}. +If both parameters are passed, a \link{TerminatorCombo} is constructed. 
+For other \link[=Terminator]{Terminators}, pass one with \code{terminator}. +If no termination criterion is needed, set \code{term_evals}, \code{term_time} and \code{terminator} to \code{NULL}. } +\section{Resources}{ + +\itemize{ +\item \href{https://mlr3book.mlr-org.com/feature-selection.html#fs-wrapper}{book chapter} on feature selection. +\item \href{https://mlr-org.com/gallery/2020-09-14-mlr3fselect-basic/}{gallery post} on feature selection on the Titanic data set. +} +} + +\section{Analysis}{ + +For analyzing the feature selection results, it is recommended to pass the archive to \code{as.data.table()}. +The returned data table is joined with the benchmark result which adds the \link[mlr3:ResampleResult]{mlr3::ResampleResult} for each feature set. + +The archive provides various getters (e.g. \verb{$learners()}) to ease the access. +All getters extract by position (\code{i}) or unique hash (\code{uhash}). +For a complete list of all getters see the methods section. + +The benchmark result (\verb{$benchmark_result}) allows to score the feature sets again on a different measure. +Alternatively, measures can be supplied to \code{as.data.table()}. 
+} + \examples{ +# Feature selection on the Pima Indians Diabetes data set task = tsk("pima") +learner = lrn("classif.rpart") +# Run feature selection instance = fselect( method = "random_search", task = task, - learner = lrn("classif.rpart"), + learner = learner, resampling = rsmp ("holdout"), measures = msr("classif.ce"), term_evals = 4) -# subset task to optimized feature set +# Subset task to optimized feature set task$select(instance$result_feature_set) + +# Train the learner with optimal feature set on the full data set +learner$train(task) + +# Inspect all evaluated feature sets +as.data.table(instance$archive) } diff --git a/man/fselect_nested.Rd b/man/fselect_nested.Rd index 1aa56610..93a69fe3 100644 --- a/man/fselect_nested.Rd +++ b/man/fselect_nested.Rd @@ -51,18 +51,19 @@ Named arguments to be set as parameters of the fselector.} Function to conduct nested resampling. } \examples{ +# Nested resampling on Palmer Penguins data set rr = fselect_nested( method = "random_search", - task = tsk("pima"), + task = tsk("penguins"), learner = lrn("classif.rpart"), inner_resampling = rsmp ("holdout"), outer_resampling = rsmp("cv", folds = 2), measure = msr("classif.ce"), term_evals = 4) -# performance scores estimated on the outer resampling +# Performance scores estimated on the outer resampling rr$score() -# unbiased performance of the final model trained on the full data set +# Unbiased performance of the final model trained on the full data set rr$aggregate() } diff --git a/man/fsi.Rd b/man/fsi.Rd new file mode 100644 index 00000000..76d2412d --- /dev/null +++ b/man/fsi.Rd @@ -0,0 +1,86 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/sugar.R +\name{fsi} +\alias{fsi} +\title{Syntactic Sugar for Instance Construction} +\usage{ +fsi( + task, + learner, + resampling, + measures = NULL, + terminator, + store_benchmark_result = TRUE, + store_models = FALSE, + check_values = FALSE +) +} +\arguments{ 
+\item{task}{(\link[mlr3:Task]{mlr3::Task})\cr +Task to operate on.} + +\item{learner}{(\link[mlr3:Learner]{mlr3::Learner})\cr +Learner to optimize the feature subset for.} + +\item{resampling}{(\link[mlr3:Resampling]{mlr3::Resampling})\cr +Resampling that is used to evaluate the performance of the feature subsets. +Uninstantiated resamplings are instantiated during construction so that all feature subsets are evaluated on the same data splits. +Already instantiated resamplings are kept unchanged.} + +\item{measures}{(\link[mlr3:Measure]{mlr3::Measure} or list of \link[mlr3:Measure]{mlr3::Measure})\cr +A single measure creates a \link{FSelectInstanceSingleCrit} and multiple measures a \link{FSelectInstanceMultiCrit}. +If \code{NULL}, default measure is used.} + +\item{terminator}{(\link{Terminator})\cr +Stop criterion of the feature selection.} + +\item{store_benchmark_result}{(\code{logical(1)})\cr +Store benchmark result in archive?} + +\item{store_models}{(\code{logical(1)}). +Store models in benchmark result?} + +\item{check_values}{(\code{logical(1)})\cr +Check the parameters before the evaluation and the results for +validity?} +} +\description{ +Function to construct a \link{FSelectInstanceSingleCrit} or \link{FSelectInstanceMultiCrit}. +} +\section{Resources}{ + +\itemize{ +\item \href{https://mlr3book.mlr-org.com/feature-selection.html#fs-wrapper}{book chapter} on feature selection. +\item \href{https://mlr-org.com/gallery/2020-09-14-mlr3fselect-basic/}{gallery post} on feature selection on the Titanic data set. 
+} +} + +\examples{ +# Feature selection on Palmer Penguins data set +task = tsk("penguins") +learner = lrn("classif.rpart") + +# Construct feature selection instance +instance = fsi( + task = task, + learner = learner, + resampling = rsmp("cv", folds = 3), + measures = msr("classif.ce"), + terminator = trm("evals", n_evals = 4) +) + +# Choose optimization algorithm +fselector = fs("random_search", batch_size = 2) + +# Run feature selection +fselector$optimize(instance) + +# Subset task to optimal feature set +task$select(instance$result_feature_set) + +# Train the learner with optimal feature set on the full data set +learner$train(task) + +# Inspect all evaluated sets +as.data.table(instance$archive) +} diff --git a/man/mlr_fselectors.Rd b/man/mlr_fselectors.Rd index 897cad12..e3c85d32 100644 --- a/man/mlr_fselectors.Rd +++ b/man/mlr_fselectors.Rd @@ -35,6 +35,15 @@ fs("random_search") } \seealso{ Sugar functions: \code{\link[=fs]{fs()}}, \code{\link[=fss]{fss()}} + +Other FSelector: +\code{\link{mlr_fselectors_design_points}}, +\code{\link{mlr_fselectors_exhaustive_search}}, +\code{\link{mlr_fselectors_genetic_search}}, +\code{\link{mlr_fselectors_random_search}}, +\code{\link{mlr_fselectors_rfe}}, +\code{\link{mlr_fselectors_sequential}}, +\code{\link{mlr_fselectors_shadow_variable_search}} } \concept{Dictionary} \concept{FSelector} diff --git a/man/mlr_fselectors_design_points.Rd b/man/mlr_fselectors_design_points.Rd index 64368b1d..5daa0f68 100644 --- a/man/mlr_fselectors_design_points.Rd +++ b/man/mlr_fselectors_design_points.Rd @@ -3,21 +3,21 @@ \name{mlr_fselectors_design_points} \alias{mlr_fselectors_design_points} \alias{FSelectorDesignPoints} -\title{Feature Selection via Design Points} +\title{Feature Selection with Design Points} \description{ -Design points uses feature sets specified by the user. - +Feature selection using user-defined feature sets. +} +\details{ The feature sets are evaluated in order as given. 
+ The feature selection terminates itself when all feature sets are evaluated. It is not necessary to set a termination criterion. } \section{Dictionary}{ -This \link{FSelector} can be instantiated via the \link[mlr3misc:Dictionary]{dictionary} -\link{mlr_fselectors} or with the associated sugar function \code{\link[=fs]{fs()}}: +This \link{FSelector} can be instantiated with the associated sugar function \code{\link[=fs]{fs()}}: -\if{html}{\out{
}}\preformatted{mlr_fselectors$get("design_points") -fs("design_points") +\if{html}{\out{
}}\preformatted{fs("design_points") }\if{html}{\out{
}} } @@ -34,16 +34,15 @@ Design points to try in search, one per row.} } \examples{ -library(mlr3misc) +# Feature Selection +\donttest{ -# retrieve task +# retrieve task and load learner task = tsk("pima") - -# load learner learner = lrn("classif.rpart") # create design -design = rowwise_table( +design = mlr3misc::rowwise_table( ~age, ~glucose, ~insulin, ~mass, ~pedigree, ~pregnant, ~pressure, ~triceps, TRUE, FALSE, TRUE, TRUE, FALSE, TRUE, FALSE, TRUE, TRUE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, FALSE, @@ -51,21 +50,19 @@ design = rowwise_table( TRUE, FALSE, TRUE, TRUE, FALSE, TRUE, TRUE, TRUE ) -\donttest{ -# feature selection on the pima indians diabetes data set +# run feature selection on the Pima Indians diabetes data set instance = fselect( - method = "design_points", + method = fs("design_points", design = design), task = task, learner = learner, - resampling = rsmp("cv", folds = 3), - measure = msr("classif.ce"), - design = design + resampling = rsmp("holdout"), + measure = msr("classif.ce") ) -# best performing feature subset +# best performing feature set instance$result -# all evaluated feature subsets +# all evaluated feature sets as.data.table(instance$archive) # subset the task and fit the final model @@ -73,6 +70,17 @@ task$select(instance$result_feature_set) learner$train(task) } } +\seealso{ +Other FSelector: +\code{\link{mlr_fselectors_exhaustive_search}}, +\code{\link{mlr_fselectors_genetic_search}}, +\code{\link{mlr_fselectors_random_search}}, +\code{\link{mlr_fselectors_rfe}}, +\code{\link{mlr_fselectors_sequential}}, +\code{\link{mlr_fselectors_shadow_variable_search}}, +\code{\link{mlr_fselectors}} +} +\concept{FSelector} \section{Super classes}{ \code{\link[mlr3fselect:FSelector]{mlr3fselect::FSelector}} -> \code{\link[mlr3fselect:FSelectorFromOptimizer]{mlr3fselect::FSelectorFromOptimizer}} -> \code{FSelectorDesignPoints} } diff --git a/man/mlr_fselectors_exhaustive_search.Rd b/man/mlr_fselectors_exhaustive_search.Rd index 
af39343a..107f910c 100644 --- a/man/mlr_fselectors_exhaustive_search.Rd +++ b/man/mlr_fselectors_exhaustive_search.Rd @@ -3,40 +3,41 @@ \name{mlr_fselectors_exhaustive_search} \alias{mlr_fselectors_exhaustive_search} \alias{FSelectorExhaustiveSearch} -\title{Feature Selection via Exhaustive Search} +\title{Feature Selection with Exhaustive Search} \description{ -Exhaustive search generates all possible feature sets. - +Feature Selection using the Exhaustive Search Algorithm. +Exhaustive Search generates all possible feature sets. +} +\details{ The feature selection terminates itself when all feature sets are evaluated. It is not necessary to set a termination criterion. } \section{Dictionary}{ -This \link{FSelector} can be instantiated via the \link[mlr3misc:Dictionary]{dictionary} -\link{mlr_fselectors} or with the associated sugar function \code{\link[=fs]{fs()}}: +This \link{FSelector} can be instantiated with the associated sugar function \code{\link[=fs]{fs()}}: -\if{html}{\out{
}}\preformatted{mlr_fselectors$get("exhaustive_search") -fs("exhaustive_search") +\if{html}{\out{
}}\preformatted{fs("exhaustive_search") }\if{html}{\out{
}} } -\section{Parameters}{ +\section{Control Parameters}{ \describe{ \item{\code{max_features}}{\code{integer(1)}\cr -Maximum number of features. By default, number of features in \link[mlr3:Task]{mlr3::Task}.} +Maximum number of features. +By default, number of features in \link[mlr3:Task]{mlr3::Task}.} } } \examples{ -# retrieve task -task = tsk("pima") +# Feature Selection +\donttest{ -# load learner +# retrieve task and load learner +task = tsk("penguins") learner = lrn("classif.rpart") -\donttest{ -# feature selection on the pima indians diabetes data set +# run feature selection on the Palmer Penguins data set instance = fselect( method = "exhaustive_search", task = task, @@ -46,10 +47,10 @@ instance = fselect( term_evals = 10 ) -# best performing feature subset +# best performing feature set instance$result -# all evaluated feature subsets +# all evaluated feature sets as.data.table(instance$archive) # subset the task and fit the final model @@ -57,6 +58,17 @@ task$select(instance$result_feature_set) learner$train(task) } } +\seealso{ +Other FSelector: +\code{\link{mlr_fselectors_design_points}}, +\code{\link{mlr_fselectors_genetic_search}}, +\code{\link{mlr_fselectors_random_search}}, +\code{\link{mlr_fselectors_rfe}}, +\code{\link{mlr_fselectors_sequential}}, +\code{\link{mlr_fselectors_shadow_variable_search}}, +\code{\link{mlr_fselectors}} +} +\concept{FSelector} \section{Super class}{ \code{\link[mlr3fselect:FSelector]{mlr3fselect::FSelector}} -> \code{FSelectorExhaustiveSearch} } diff --git a/man/mlr_fselectors_genetic_search.Rd b/man/mlr_fselectors_genetic_search.Rd index 956c492b..19b49021 100644 --- a/man/mlr_fselectors_genetic_search.Rd +++ b/man/mlr_fselectors_genetic_search.Rd @@ -3,32 +3,19 @@ \name{mlr_fselectors_genetic_search} \alias{mlr_fselectors_genetic_search} \alias{FSelectorGeneticSearch} -\title{Feature Selection via Genetic Search} +\title{Feature Selection with Genetic Search} \description{ -Genetic search imitates the process of 
natural selection to generate feature sets. - -Calls \code{\link[genalg:rbga.bin]{genalg::rbga.bin()}} from package \CRANpkg{genalg}. +Feature selection using the Genetic Algorithm from the package \CRANpkg{genalg}. } \section{Dictionary}{ -This \link{FSelector} can be instantiated via the \link[mlr3misc:Dictionary]{dictionary} -\link{mlr_fselectors} or with the associated sugar function \code{\link[=fs]{fs()}}: +This \link{FSelector} can be instantiated with the associated sugar function \code{\link[=fs]{fs()}}: -\if{html}{\out{
}}\preformatted{mlr_fselectors$get("genetic_search") -fs("genetic_search") +\if{html}{\out{
}}\preformatted{fs("genetic_search") }\if{html}{\out{
}} } -\section{Parameters}{ - -\describe{ -\item{\code{suggestions}}{\code{list()}} -\item{\code{popSize}}{\code{integer(1)}} -\item{\code{mutationChance}}{\code{numeric(1)}} -\item{\code{elitism}}{\code{integer(1)}} -\item{\code{zeroToOneRatio}}{\code{integer(1)}} -\item{\code{iters}}{\code{integer(1)}} -} +\section{Control Parameters}{ For the meaning of the control parameters, see \code{\link[genalg:rbga.bin]{genalg::rbga.bin()}}. \code{\link[genalg:rbga.bin]{genalg::rbga.bin()}} internally terminates after \code{iters} iteration. @@ -37,14 +24,14 @@ If more iterations are needed, set \code{ìters} to a higher value in the parame } \examples{ -# retrieve task -task = tsk("pima") +# Feature Selection +\donttest{ -# load learner +# retrieve task and load learner +task = tsk("penguins") learner = lrn("classif.rpart") -\donttest{ -# feature selection on the pima indians diabetes data set +# run feature selection on the Palmer Penguins data set instance = fselect( method = "genetic_search", task = task, @@ -54,10 +41,10 @@ instance = fselect( term_evals = 10 ) -# best performing feature subset +# best performing feature set instance$result -# all evaluated feature subsets +# all evaluated feature sets as.data.table(instance$archive) # subset the task and fit the final model @@ -65,6 +52,17 @@ task$select(instance$result_feature_set) learner$train(task) } } +\seealso{ +Other FSelector: +\code{\link{mlr_fselectors_design_points}}, +\code{\link{mlr_fselectors_exhaustive_search}}, +\code{\link{mlr_fselectors_random_search}}, +\code{\link{mlr_fselectors_rfe}}, +\code{\link{mlr_fselectors_sequential}}, +\code{\link{mlr_fselectors_shadow_variable_search}}, +\code{\link{mlr_fselectors}} +} +\concept{FSelector} \section{Super class}{ \code{\link[mlr3fselect:FSelector]{mlr3fselect::FSelector}} -> \code{FSelectorGeneticSearch} } diff --git a/man/mlr_fselectors_random_search.Rd b/man/mlr_fselectors_random_search.Rd index c70c4c34..0c837e9e 100644 --- 
a/man/mlr_fselectors_random_search.Rd +++ b/man/mlr_fselectors_random_search.Rd @@ -3,7 +3,7 @@ \name{mlr_fselectors_random_search} \alias{mlr_fselectors_random_search} \alias{FSelectorRandomSearch} -\title{Feature Selection via Random Search} +\title{Feature Selection with Random Search} \source{ Bergstra J, Bengio Y (2012). \dQuote{Random Search for Hyper-Parameter Optimization.} @@ -11,47 +11,48 @@ Bergstra J, Bengio Y (2012). \url{https://jmlr.csail.mit.edu/papers/v13/bergstra12a.html}. } \description{ -Random search randomly draws feature sets. - -Feature sets are evaluated in batches of size \code{batch_size}. +Feature selection using Random Search Algorithm. +} +\details{ +The feature sets are randomly drawn. +The sets are evaluated in batches of size \code{batch_size}. Larger batches mean we can parallelize more, smaller batches imply a more fine-grained checking of termination criteria. } \section{Dictionary}{ -This \link{FSelector} can be instantiated via the \link[mlr3misc:Dictionary]{dictionary} -\link{mlr_fselectors} or with the associated sugar function \code{\link[=fs]{fs()}}: +This \link{FSelector} can be instantiated with the associated sugar function \code{\link[=fs]{fs()}}: -\if{html}{\out{
}}\preformatted{mlr_fselectors$get("random_search") -fs("random_search") +\if{html}{\out{
}}\preformatted{fs("random_search") }\if{html}{\out{
}} } -\section{Parameters}{ +\section{Control Parameters}{ \describe{ \item{\code{max_features}}{\code{integer(1)}\cr -Maximum number of features. By default, number of features in \link[mlr3:Task]{mlr3::Task}.} +Maximum number of features. +By default, number of features in \link[mlr3:Task]{mlr3::Task}.} \item{\code{batch_size}}{\code{integer(1)}\cr Maximum number of feature sets to try in a batch.} } } \examples{ -# retrieve task -task = tsk("pima") +# Feature Selection +\donttest{ -# load learner +# retrieve task and load learner +task = tsk("penguins") learner = lrn("classif.rpart") -\donttest{ -# feature selection on the pima indians diabetes data set +# run feature selection on the Palmer Penguins data set instance = fselect( - method = "random_search", + method = fs("random_search"), task = task, learner = learner, resampling = rsmp("holdout"), measure = msr("classif.ce"), - term_evals = 100 + term_evals = 10 ) # best performing feature subset @@ -65,6 +66,17 @@ task$select(instance$result_feature_set) learner$train(task) } } +\seealso{ +Other FSelector: +\code{\link{mlr_fselectors_design_points}}, +\code{\link{mlr_fselectors_exhaustive_search}}, +\code{\link{mlr_fselectors_genetic_search}}, +\code{\link{mlr_fselectors_rfe}}, +\code{\link{mlr_fselectors_sequential}}, +\code{\link{mlr_fselectors_shadow_variable_search}}, +\code{\link{mlr_fselectors}} +} +\concept{FSelector} \section{Super class}{ \code{\link[mlr3fselect:FSelector]{mlr3fselect::FSelector}} -> \code{FSelectorRandomSearch} } diff --git a/man/mlr_fselectors_rfe.Rd b/man/mlr_fselectors_rfe.Rd index 604556cb..321306d1 100644 --- a/man/mlr_fselectors_rfe.Rd +++ b/man/mlr_fselectors_rfe.Rd @@ -3,8 +3,9 @@ \name{mlr_fselectors_rfe} \alias{mlr_fselectors_rfe} \alias{FSelectorRFE} -\title{Feature Selection via Recursive Feature Elimination} +\title{Feature Selection with Recursive Feature Elimination} \description{ +Feature selection using the Recursive Feature Elimination Algorithm (RFE). 
Recursive feature elimination iteratively removes features with a low importance score. Only works with \link{Learner}s that can calculate importance scores (see section on optional extractors in \link{Learner}). } @@ -19,21 +20,21 @@ It is not necessary to set a termination criterion. } \section{Dictionary}{ -This \link{FSelector} can be instantiated via the \link[mlr3misc:Dictionary]{dictionary} -\link{mlr_fselectors} or with the associated sugar function \code{\link[=fs]{fs()}}: +This \link{FSelector} can be instantiated with the associated sugar function \code{\link[=fs]{fs()}}: -\if{html}{\out{
}}\preformatted{mlr_fselectors$get("rfe") -fs("rfe") +\if{html}{\out{
}}\preformatted{fs("rfe") }\if{html}{\out{
}} } -\section{Parameters}{ +\section{Control Parameters}{ \describe{ \item{\code{n_features}}{\code{integer(1)}\cr -The number of features to select. By default half of the features are selected.} +The number of features to select. +By default half of the features are selected.} \item{\code{feature_fraction}}{\code{double(1)}\cr -Fraction of features to retain in each iteration, The default 0.5 retrains half of the features.} +Fraction of features to retain in each iteration. +The default 0.5 retains half of the features.} \item{\code{feature_number}}{\code{integer(1)}\cr Number of features to remove in each iteration.} \item{\code{subset_sizes}}{\code{integer()}\cr @@ -47,16 +48,16 @@ The parameter \code{feature_fraction}, \code{feature_number} and \code{subset_si } \examples{ -# retrieve task -task = tsk("pima") +# Feature Selection +\donttest{ -# load learner +# retrieve task and load learner +task = tsk("penguins") learner = lrn("classif.rpart") -\donttest{ -# feature selection on the pima indians diabetes data set +# run feature selection on the Palmer Penguins data set instance = fselect( - method = "rfe", + method = fs("rfe"), task = task, learner = learner, resampling = rsmp("holdout"), @@ -75,6 +76,17 @@ task$select(instance$result_feature_set) learner$train(task) } } +\seealso{ +Other FSelector: +\code{\link{mlr_fselectors_design_points}}, +\code{\link{mlr_fselectors_exhaustive_search}}, +\code{\link{mlr_fselectors_genetic_search}}, +\code{\link{mlr_fselectors_random_search}}, +\code{\link{mlr_fselectors_sequential}}, +\code{\link{mlr_fselectors_shadow_variable_search}}, +\code{\link{mlr_fselectors}} +} +\concept{FSelector} \section{Super class}{ \code{\link[mlr3fselect:FSelector]{mlr3fselect::FSelector}} -> \code{FSelectorRFE} } diff --git a/man/mlr_fselectors_sequential.Rd b/man/mlr_fselectors_sequential.Rd index 40f555f9..ec746ec1 100644 --- a/man/mlr_fselectors_sequential.Rd +++ b/man/mlr_fselectors_sequential.Rd @@ -3,10 +3,11 @@ 
\name{mlr_fselectors_sequential} \alias{mlr_fselectors_sequential} \alias{FSelectorSequential} -\title{Feature Selection via Sequential Search} +\title{Feature Selection with Sequential Search} \description{ -Sequential search iteratively adds features to the set. - +Feature selection using the Sequential Search Algorithm. +} +\details{ Sequential forward selection (\code{strategy = fsf}) extends the feature set in each iteration with the feature that increases the models performance the most. Sequential backward selection (\code{strategy = fsb}) follows the same idea but starts with all features and removes features from the set. @@ -15,15 +16,13 @@ It is not necessary to set a termination criterion. } \section{Dictionary}{ -This \link{FSelector} can be instantiated via the \link[mlr3misc:Dictionary]{dictionary} -\link{mlr_fselectors} or with the associated sugar function \code{\link[=fs]{fs()}}: +This \link{FSelector} can be instantiated with the associated sugar function \code{\link[=fs]{fs()}}: -\if{html}{\out{
}}\preformatted{mlr_fselectors$get("sequential") -fs("sequential") +\if{html}{\out{
}}\preformatted{fs("sequential") }\if{html}{\out{
}} } -\section{Parameters}{ +\section{Control Parameters}{ \describe{ \item{\code{min_features}}{\code{integer(1)}\cr @@ -36,14 +35,14 @@ Search method \code{sfs} (forward search) or \code{sbs} (backward search).} } \examples{ -# retrieve task -task = tsk("pima") +# Feature Selection +\donttest{ -# load learner +# retrieve task and load learner +task = tsk("penguins") learner = lrn("classif.rpart") -\donttest{ -# feature selection on the pima indians diabetes data set +# run feature selection on the Palmer Penguins data set instance = fselect( method = "sequential", task = task, @@ -53,10 +52,10 @@ instance = fselect( term_evals = 10 ) -# best performing feature subset +# best performing feature set instance$result -# all evaluated feature subsets +# all evaluated feature sets as.data.table(instance$archive) # subset the task and fit the final model @@ -64,6 +63,17 @@ task$select(instance$result_feature_set) learner$train(task) } } +\seealso{ +Other FSelector: +\code{\link{mlr_fselectors_design_points}}, +\code{\link{mlr_fselectors_exhaustive_search}}, +\code{\link{mlr_fselectors_genetic_search}}, +\code{\link{mlr_fselectors_random_search}}, +\code{\link{mlr_fselectors_rfe}}, +\code{\link{mlr_fselectors_shadow_variable_search}}, +\code{\link{mlr_fselectors}} +} +\concept{FSelector} \section{Super class}{ \code{\link[mlr3fselect:FSelector]{mlr3fselect::FSelector}} -> \code{FSelectorSequential} } diff --git a/man/mlr_fselectors_shadow_variable_search.Rd b/man/mlr_fselectors_shadow_variable_search.Rd index 6f86edff..f31503a7 100644 --- a/man/mlr_fselectors_shadow_variable_search.Rd +++ b/man/mlr_fselectors_shadow_variable_search.Rd @@ -3,7 +3,7 @@ \name{mlr_fselectors_shadow_variable_search} \alias{mlr_fselectors_shadow_variable_search} \alias{FSelectorShadowVariableSearch} -\title{Feature Selection via Shadow Variable Search} +\title{Feature Selection with Shadow Variable Search} \source{ Thomas J, Hepp T, Mayr A, Bischl B (2017). 
\dQuote{Probing for Sparse and Fast Variable Selection with Model-Based Boosting.} @@ -16,32 +16,32 @@ Wu Y, Boos DD, Stefanski LA (2007). \doi{10.1198/016214506000000843}. } \description{ +Feature selection using the Shadow Variable Search Algorithm. Shadow variable search creates for each feature a permutated copy and stops when one of them is selected. - +} +\details{ The feature selection terminates itself when the first shadow variable is selected. It is not necessary to set a termination criterion. } \section{Dictionary}{ -This \link{FSelector} can be instantiated via the \link[mlr3misc:Dictionary]{dictionary} -\link{mlr_fselectors} or with the associated sugar function \code{\link[=fs]{fs()}}: +This \link{FSelector} can be instantiated with the associated sugar function \code{\link[=fs]{fs()}}: -\if{html}{\out{
}}\preformatted{mlr_fselectors$get("shadow_variable_search") -fs("shadow_variable_search") +\if{html}{\out{
}}\preformatted{fs("shadow_variable_search") }\if{html}{\out{
}} } \examples{ -# retrieve task -task = tsk("pima") +# Feature Selection +\donttest{ -# load learner +# retrieve task and load learner +task = tsk("penguins") learner = lrn("classif.rpart") -\donttest{ -# feature selection on the pima indians diabetes data set +# run feature selection on the Palmer Penguins data set instance = fselect( - method = "shadow_variable_search", + method = fs("shadow_variable_search"), task = task, learner = learner, resampling = rsmp("holdout"), @@ -59,6 +59,17 @@ task$select(instance$result_feature_set) learner$train(task) } } +\seealso{ +Other FSelector: +\code{\link{mlr_fselectors_design_points}}, +\code{\link{mlr_fselectors_exhaustive_search}}, +\code{\link{mlr_fselectors_genetic_search}}, +\code{\link{mlr_fselectors_random_search}}, +\code{\link{mlr_fselectors_rfe}}, +\code{\link{mlr_fselectors_sequential}}, +\code{\link{mlr_fselectors}} +} +\concept{FSelector} \section{Super class}{ \code{\link[mlr3fselect:FSelector]{mlr3fselect::FSelector}} -> \code{FSelectorShadowVariableSearch} } diff --git a/tests/testthat/test_ArchiveFSelect.R b/tests/testthat/test_ArchiveFSelect.R index 2d3cb3a5..b43e506f 100644 --- a/tests/testthat/test_ArchiveFSelect.R +++ b/tests/testthat/test_ArchiveFSelect.R @@ -60,45 +60,45 @@ test_that("ArchiveFSelect as.data.table function works", { # default tab = as.data.table(instance$archive) - expect_data_table(tab, nrows = 4, ncols = 13) + expect_data_table(tab, nrows = 4, ncols = 15) expect_named(tab, c("age", "glucose", "insulin", "mass", "pedigree", "pregnant", "pressure", "triceps", "classif.ce", - "runtime_learners", "timestamp", "batch_nr", "resample_result")) + "runtime_learners", "timestamp", "batch_nr", "warnings", "errors", "resample_result")) # extra measure tab = as.data.table(instance$archive, measures = msr("classif.acc")) - expect_data_table(tab, nrows = 4, ncols = 14) + expect_data_table(tab, nrows = 4, ncols = 16) expect_named(tab, c("age", "glucose", "insulin", "mass", "pedigree", "pregnant", 
"pressure", "triceps", "classif.ce", - "classif.acc", "runtime_learners", "timestamp", "batch_nr", "resample_result")) + "classif.acc", "runtime_learners", "timestamp", "batch_nr", "warnings", "errors", "resample_result")) # extra measures tab = as.data.table(instance$archive, measures = msrs(c("classif.acc", "classif.mcc"))) - expect_data_table(tab, nrows = 4, ncols = 15) + expect_data_table(tab, nrows = 4, ncols = 17) expect_named(tab, c("age", "glucose", "insulin", "mass", "pedigree", "pregnant", "pressure", "triceps", "classif.ce", - "classif.acc", "classif.mcc", "runtime_learners", "timestamp", "batch_nr", "resample_result")) + "classif.acc", "classif.mcc", "runtime_learners", "timestamp", "batch_nr", "warnings", "errors", "resample_result")) # exclude column tab = as.data.table(instance$archive, exclude_columns = "timestamp") - expect_data_table(tab, nrows = 4, ncols = 13) + expect_data_table(tab, nrows = 4, ncols = 15) expect_named(tab, c("age", "glucose", "insulin", "mass", "pedigree", "pregnant", "pressure", "triceps", "classif.ce", - "runtime_learners", "batch_nr", "uhash", "resample_result")) + "runtime_learners", "batch_nr", "uhash", "warnings", "errors", "resample_result")) # exclude columns tab = as.data.table(instance$archive, exclude_columns = c("timestamp", "uhash")) - expect_data_table(tab, nrows = 4, ncols = 12) + expect_data_table(tab, nrows = 4, ncols = 14) expect_named(tab, c("age", "glucose", "insulin", "mass", "pedigree", "pregnant", "pressure", "triceps", "classif.ce", - "runtime_learners", "batch_nr", "resample_result")) + "runtime_learners", "batch_nr", "warnings", "errors", "resample_result")) # no exclude tab = as.data.table(instance$archive, exclude_columns = NULL) - expect_data_table(tab, nrows = 4, ncols = 14) + expect_data_table(tab, nrows = 4, ncols = 16) expect_named(tab, c("age", "glucose", "insulin", "mass", "pedigree", "pregnant", "pressure", "triceps", "classif.ce", - "runtime_learners", "timestamp", "batch_nr", "uhash", 
"resample_result")) + "runtime_learners", "timestamp", "batch_nr", "uhash", "warnings", "errors", "resample_result")) # no unnest tab = as.data.table(instance$archive, unnest = NULL) - expect_data_table(tab, nrows = 4, ncols = 13) + expect_data_table(tab, nrows = 4, ncols = 15) expect_named(tab, c("age", "glucose", "insulin", "mass", "pedigree", "pregnant", "pressure", "triceps", "classif.ce", - "runtime_learners", "timestamp", "batch_nr", "resample_result")) + "runtime_learners", "timestamp", "batch_nr", "warnings", "errors", "resample_result")) # without benchmark result instance = FSelectInstanceSingleCrit$new( @@ -112,9 +112,9 @@ test_that("ArchiveFSelect as.data.table function works", { fselector$optimize(instance) tab = as.data.table(instance$archive) - expect_data_table(tab, nrows = 4, ncols = 12) + expect_data_table(tab, nrows = 4, ncols = 14) expect_named(tab, c("age", "glucose", "insulin", "mass", "pedigree", "pregnant", "pressure", "triceps", "classif.ce", - "runtime_learners", "timestamp", "batch_nr")) + "runtime_learners", "timestamp", "batch_nr", "warnings", "errors")) # empty archive instance = FSelectInstanceSingleCrit$new( diff --git a/tests/testthat/test_AutoFSelector.R b/tests/testthat/test_AutoFSelector.R index 4bdc66bc..f6bb95e6 100644 --- a/tests/testthat/test_AutoFSelector.R +++ b/tests/testthat/test_AutoFSelector.R @@ -93,7 +93,7 @@ test_that("store_fselect_instance, store_benchmark_result and store_models flags at$train(task) expect_r6(at$fselect_instance, "FSelectInstanceSingleCrit") - expect_null(at$fselect_instance$archive$benchmark_result) + expect_equal(at$fselect_instance$archive$benchmark_result$n_resample_results, 0L) at = AutoFSelector$new(lrn("classif.rpart"), rsmp("holdout"), ms, te, fselector = fselector, store_fselect_instance = FALSE, store_benchmark_result = FALSE, diff --git a/tests/testthat/test_ObjectiveFSelect.R b/tests/testthat/test_ObjectiveFSelect.R index 1ee4aa43..30457810 100644 --- 
a/tests/testthat/test_ObjectiveFSelect.R +++ b/tests/testthat/test_ObjectiveFSelect.R @@ -4,15 +4,16 @@ test_that("ObjectiveFSelect", { resampling = rsmp("holdout") measures = msr("dummy") - obj = ObjectiveFSelect$new(task = task, learner = learner, - resampling = resampling, measures = measures, store_models = TRUE) + archive = ArchiveFSelect$new(search_space = task_to_domain(task), codomain = measures_to_codomain(measures)) + + obj = ObjectiveFSelect$new(task = task, learner = learner, resampling = resampling, measures = measures, archive = archive, store_models = TRUE) xss = list( list("x1" = TRUE, "x2" = FALSE, "x3" = TRUE, "x4" = TRUE), list("x1" = FALSE, "x2" = TRUE, "x3" = TRUE, "x4" = TRUE)) z = obj$eval_many(xss) - expect_data_table(z, nrows = 2, ncols = 3) + expect_data_table(z, nrows = 2, ncols = 5) expect_equal(obj$archive$benchmark_result$resample_result(1)$learners[[1]]$model$select$selection, c("x1", "x3", "x4")) expect_equal(obj$archive$benchmark_result$resample_result(2)$learners[[1]]$model$select$selection, c("x2", "x3", "x4")) }) @@ -23,15 +24,15 @@ test_that("ObjectiveFSelect works with multiple measures", { resampling = rsmp("holdout") measures = msrs(c("regr.mse", "regr.rmse")) - obj = ObjectiveFSelect$new(task = task, learner = learner, - resampling = resampling, measures = measures, store_models = TRUE) + archive = ArchiveFSelect$new(search_space = task_to_domain(task), codomain = measures_to_codomain(measures)) + obj = ObjectiveFSelect$new(task = task, learner = learner, resampling = resampling, measures = measures, archive = archive, store_models = TRUE) xss = list( list("x1" = TRUE, "x2" = FALSE, "x3" = TRUE, "x4" = TRUE), list("x1" = FALSE, "x2" = TRUE, "x3" = TRUE, "x4" = TRUE)) z = obj$eval_many(xss) - expect_data_table(z, nrows = 2, ncols = 4) + expect_data_table(z, nrows = 2, ncols = 6) }) test_that("ObjectiveFSelect works with store_models", { @@ -40,8 +41,9 @@ test_that("ObjectiveFSelect works with store_models", { resampling = 
rsmp("holdout") measures = msr("dummy") + archive = ArchiveFSelect$new(search_space = task_to_domain(task), codomain = measures_to_codomain(measures)) obj = ObjectiveFSelect$new(task = task, learner = learner, - resampling = resampling, measures = measures, + resampling = resampling, measures = measures, archive = archive, store_models = TRUE) xss = list( diff --git a/tests/testthat/extract_inner_fselect_archives.R b/tests/testthat/test_extract_inner_fselect_archives.R similarity index 55% rename from tests/testthat/extract_inner_fselect_archives.R rename to tests/testthat/test_extract_inner_fselect_archives.R index 46b82fe4..06299de0 100644 --- a/tests/testthat/extract_inner_fselect_archives.R +++ b/tests/testthat/test_extract_inner_fselect_archives.R @@ -1,23 +1,22 @@ -test_that("extract_inner_fselect_archives function works", { - # cv +test_that("extract_inner_fselect_archives function works with resample and cv", { rr = fselect_nested("random_search", tsk("iris"), lrn("classif.rpart"), rsmp("holdout"), rsmp("cv", folds = 2), msr("classif.ce"), term_evals = 4) irr = extract_inner_fselect_archives(rr) expect_data_table(irr, nrows = 8) - expect_named(irr, c("iteration", "Petal.Length", "Petal.Width", "Sepal.Length", "Sepal.Width", "classif.ce", - "runtime_learners", "timestamp", "batch_nr", "resample_result", "task_id", "learner_id", "resampling_id")) + expect_named(irr, c("iteration", "Petal.Length", "Petal.Width", "Sepal.Length", "Sepal.Width", "classif.ce", "runtime_learners", "timestamp", "batch_nr", "warnings", "errors", "resample_result", "task_id", "learner_id", "resampling_id")) +}) - # repeated cv - rr = fselect_nested("random_search", tsk("iris"), lrn("classif.rpart"), rsmp("holdout"), +test_that("extract_inner_fselect_archives function works with resample and repeated cv", { + rr = fselect_nested("random_search", tsk("iris"), lrn("classif.rpart"), rsmp("holdout"), rsmp("repeated_cv", folds = 2, repeats = 3), msr("classif.ce"), term_evals = 4) irr = 
extract_inner_fselect_archives(rr) expect_data_table(irr, nrows = 24) - expect_named(irr, c("iteration", "Petal.Length", "Petal.Width", "Sepal.Length", "Sepal.Width", "classif.ce", - "runtime_learners", "timestamp", "batch_nr", "resample_result", "task_id", "learner_id", "resampling_id")) + expect_named(irr, c("iteration", "Petal.Length", "Petal.Width", "Sepal.Length", "Sepal.Width", "classif.ce", "runtime_learners", "timestamp", "batch_nr", "warnings", "errors", "resample_result", "task_id", "learner_id", "resampling_id")) +}) - # cv +test_that("extract_inner_fselect_archives function works with benchmark and cv", { at_1 = auto_fselector("random_search", lrn("classif.rpart"), rsmp("holdout"), msr("classif.ce"), term_evals = 4) at_2 = auto_fselector("random_search", lrn("classif.rpart"), rsmp("holdout"), msr("classif.ce"), term_evals = 4) resampling_outer = rsmp("cv", folds = 2) @@ -26,12 +25,11 @@ test_that("extract_inner_fselect_archives function works", { ibmr = extract_inner_fselect_archives(bmr) expect_data_table(ibmr, nrows = 16) - expect_named(ibmr, c("experiment", "iteration", "Petal.Length", "Petal.Width", "Sepal.Length", "Sepal.Width", - "classif.ce", "runtime_learners", "timestamp", "batch_nr", "resample_result", "task_id", "learner_id", - "resampling_id")) + expect_named(ibmr, c("experiment", "iteration", "Petal.Length", "Petal.Width", "Sepal.Length", "Sepal.Width", "classif.ce", "runtime_learners", "timestamp", "batch_nr", "warnings", "errors", "resample_result", "task_id", "learner_id", "resampling_id")) expect_equal(unique(ibmr$experiment), c(1, 2)) +}) - # repeated cv +test_that("extract_inner_fselect_archives function works with benchmark and repeated cv", { at_1 = auto_fselector("random_search", lrn("classif.rpart"), rsmp("holdout"), msr("classif.ce"), term_evals = 4) at_2 = auto_fselector("random_search", lrn("classif.rpart"), rsmp("holdout"), msr("classif.ce"), term_evals = 4) resampling_outer = rsmp("repeated_cv", folds = 2, repeats = 3) @@ 
-40,12 +38,11 @@ test_that("extract_inner_fselect_archives function works", { ibmr = extract_inner_fselect_archives(bmr) expect_data_table(ibmr, nrows = 48) - expect_named(ibmr, c("experiment", "iteration", "Petal.Length", "Petal.Width", "Sepal.Length", "Sepal.Width", - "classif.ce", "runtime_learners", "timestamp", "batch_nr", "resample_result", "task_id", "learner_id", - "resampling_id")) + expect_named(ibmr, c("experiment", "iteration", "Petal.Length", "Petal.Width", "Sepal.Length", "Sepal.Width", "classif.ce", "runtime_learners", "timestamp", "batch_nr", "warnings", "errors", "resample_result", "task_id", "learner_id", "resampling_id")) expect_equal(unique(ibmr$experiment), c(1, 2)) +}) - # different tasks +test_that("extract_inner_fselect_archives function works with multiple tasks", { at_1 = auto_fselector("random_search", lrn("classif.rpart"), rsmp("holdout"), msr("classif.ce"), term_evals = 4) at_2 = auto_fselector("random_search", lrn("classif.rpart"), rsmp("holdout"), msr("classif.ce"), term_evals = 4) resampling_outer = rsmp("cv", folds = 2) @@ -54,59 +51,60 @@ test_that("extract_inner_fselect_archives function works", { ibmr = extract_inner_fselect_archives(bmr) expect_data_table(ibmr, nrows = 32) - expect_named(ibmr, c("experiment", "iteration", "Petal.Length", "Petal.Width", "Sepal.Length", "Sepal.Width", "age", - "glucose", "insulin", "mass", "pedigree", "pregnant", "pressure", "triceps", "classif.ce", "runtime_learners", - "timestamp", "batch_nr", "resample_result", "task_id", "learner_id", "resampling_id")) + expect_named(ibmr, c("experiment", "iteration", "Petal.Length", "Petal.Width", "Sepal.Length", "Sepal.Width", "age", "glucose", "insulin", "mass", "pedigree", "pregnant", "pressure", "triceps", "classif.ce", "runtime_learners", "timestamp", "batch_nr", "warnings", "errors", "resample_result", "task_id", "learner_id", "resampling_id")) expect_equal(unique(ibmr$experiment), c(1, 2, 3, 4)) +}) - # no models 
+test_that("extract_inner_fselect_archives function works with no models", { at = auto_fselector("random_search", lrn("classif.rpart"), rsmp("holdout"), msr("classif.ce"), term_evals = 4) resampling_outer = rsmp("cv", folds = 2) rr = resample(tsk("iris"), at, resampling_outer, store_models = FALSE) expect_data_table(extract_inner_fselect_archives(rr), nrows = 0, ncols = 0) +}) - # no instance - at = AutoFSelector$new(lrn("classif.rpart"), rsmp("holdout"), msr("classif.ce"), trm("evals", n_evals = 4), - fselector = fs("random_search"), store_fselect_instance = FALSE, store_benchmark_result = FALSE) +test_that("extract_inner_fselect_archives function works with no instance", { + at = AutoFSelector$new(lrn("classif.rpart"), rsmp("holdout"), msr("classif.ce"), trm("evals", n_evals = 4), + fselector = fs("random_search"), store_fselect_instance = FALSE, store_benchmark_result = FALSE) resampling_outer = rsmp("cv", folds = 2) rr = resample(tsk("iris"), at, resampling_outer, store_models = TRUE) expect_data_table(extract_inner_fselect_archives(rr), nrows = 0, ncols = 0) +}) - # no models +test_that("extract_inner_fselect_archives function works with benchmark and no models", { at_1 = auto_fselector("random_search", lrn("classif.rpart"), rsmp("holdout"), msr("classif.ce"), term_evals = 4) at_2 = auto_fselector("random_search", lrn("classif.rpart"), rsmp("holdout"), msr("classif.ce"), term_evals = 4) resampling_outer = rsmp("cv", folds = 2) grid = benchmark_grid(tsk("iris"), list(at_1, at_2), resampling_outer) bmr = benchmark(grid, store_models = FALSE) - expect_data_table(extract_inner_fselect_archives(rr), nrows = 0, ncols = 0) + expect_data_table(extract_inner_fselect_archives(bmr), nrows = 0, ncols = 0) +}) - # https://github.com/mlr-org/mlr3/issues/647 - # mixed store instance - # at_1 = AutoFSelector$new(lrn("classif.rpart"), rsmp("holdout"), msr("classif.ce"), trm("evals", n_evals = 4), - # fselector = fs("random_search"), store_fselect_instance = FALSE, 
store_benchmark_result = FALSE) - # at_2 = AutoFSelector$new(lrn("classif.rpart"), rsmp("holdout"), msr("classif.ce"), trm("evals", n_evals = 4), - # fselector = fs("random_search")) - # resampling_outer = rsmp("cv", folds = 2) - # grid = benchmark_grid(tsk("iris"), list(at_1, at_2), resampling_outer) - # bmr = benchmark(grid, store_models = TRUE) - - # ibmr = extract_inner_fselect_archives(bmr) - # expect_data_table(ibmr, nrows = 2, ncols = 9) - # expect_equal(unique(ibmr$experiment), 2) - - # autotuner and learner +test_that("extract_inner_fselect_archives function works with mixed store instance", { + at_1 = AutoFSelector$new(lrn("classif.rpart"), rsmp("holdout"), msr("classif.ce"), trm("evals", n_evals = 4), + fselector = fs("random_search"), store_fselect_instance = FALSE, store_benchmark_result = FALSE) + at_2 = AutoFSelector$new(lrn("classif.rpart"), rsmp("holdout"), msr("classif.ce"), trm("evals", n_evals = 4), + fselector = fs("random_search")) + resampling_outer = rsmp("cv", folds = 2) + grid = benchmark_grid(tsk("iris"), list(at_1, at_2), resampling_outer) + bmr = benchmark(grid, store_models = TRUE) + + ibmr = extract_inner_fselect_archives(bmr) + expect_data_table(ibmr, ncols = 16) + expect_equal(unique(ibmr$experiment), 2) +}) + +test_that("extract_inner_fselect_archives function works with autofselector and learner", { learner = lrn("classif.rpart") at = auto_fselector("random_search", lrn("classif.rpart"), rsmp("holdout"), msr("classif.ce"), term_evals = 4) + resampling_outer = rsmp("cv", folds = 2) grid = benchmark_grid(tsk("iris"), list(at, learner), resampling_outer) bmr = benchmark(grid, store_models = TRUE) ibmr = extract_inner_fselect_archives(bmr) expect_data_table(ibmr, nrows = 8) - expect_named(ibmr, c("experiment", "iteration", "Petal.Length", "Petal.Width", "Sepal.Length", "Sepal.Width", - "classif.ce", "runtime_learners", "timestamp", "batch_nr", "resample_result", "task_id", "learner_id", - "resampling_id")) + expect_named(ibmr, 
c("experiment", "iteration", "Petal.Length", "Petal.Width", "Sepal.Length", "Sepal.Width", "classif.ce", "runtime_learners", "timestamp", "batch_nr", "warnings", "errors", "resample_result", "task_id", "learner_id", "resampling_id")) expect_equal(unique(ibmr$experiment), 1) }) diff --git a/tests/testthat/test_extract_inner_fselect_result.R b/tests/testthat/test_extract_inner_fselect_result.R index 8168412c..bd908aa3 100644 --- a/tests/testthat/test_extract_inner_fselect_result.R +++ b/tests/testthat/test_extract_inner_fselect_result.R @@ -1,23 +1,22 @@ -test_that("extract_inner_fselect_results function works", { - # cv +test_that("extract_inner_fselect_results function works with resample and cv", { rr = fselect_nested("random_search", tsk("iris"), lrn("classif.rpart"), rsmp("holdout"), rsmp("cv", folds = 2), msr("classif.ce"), term_evals = 4) irr = extract_inner_fselect_results(rr) expect_data_table(irr, nrows = 2) - expect_named(irr, c("iteration", "Petal.Length", "Petal.Width", "Sepal.Length", "Sepal.Width", "classif.ce", - "features", "task_id", "learner_id", "resampling_id")) + expect_named(irr, c("iteration", "Petal.Length", "Petal.Width", "Sepal.Length", "Sepal.Width", "classif.ce", "features", "task_id", "learner_id", "resampling_id")) +}) - # repeated cv - rr = fselect_nested("random_search", tsk("iris"), lrn("classif.rpart"), rsmp("holdout"), +test_that("extract_inner_fselect_results function works with resample and repeated cv", { + rr = fselect_nested("random_search", tsk("iris"), lrn("classif.rpart"), rsmp("holdout"), rsmp("repeated_cv", folds = 2, repeats = 3), msr("classif.ce"), term_evals = 4) - + irr = extract_inner_fselect_results(rr) expect_data_table(irr, nrows = 6) - expect_named(irr, c("iteration", "Petal.Length", "Petal.Width", "Sepal.Length", "Sepal.Width", "classif.ce", - "features", "task_id", "learner_id", "resampling_id")) + expect_named(irr, c("iteration", "Petal.Length", "Petal.Width", "Sepal.Length", "Sepal.Width", "classif.ce", 
"features", "task_id", "learner_id", "resampling_id")) +}) - # cv +test_that("extract_inner_fselect_results function works with benchmark and cv", { at_1 = auto_fselector("random_search", lrn("classif.rpart"), rsmp("holdout"), msr("classif.ce"), term_evals = 4) at_2 = auto_fselector("random_search", lrn("classif.rpart"), rsmp("holdout"), msr("classif.ce"), term_evals = 4) resampling_outer = rsmp("cv", folds = 2) @@ -26,11 +25,11 @@ test_that("extract_inner_fselect_results function works", { ibmr = extract_inner_fselect_results(bmr) expect_data_table(ibmr, nrows = 4) - expect_named(ibmr, c("experiment", "iteration", "Petal.Length", "Petal.Width", "Sepal.Length", "Sepal.Width", - "classif.ce", "features", "task_id", "learner_id", "resampling_id")) + expect_named(ibmr, c("experiment", "iteration", "Petal.Length", "Petal.Width", "Sepal.Length", "Sepal.Width", "classif.ce", "features", "task_id", "learner_id", "resampling_id")) expect_equal(unique(ibmr$experiment), c(1, 2)) +}) - # repeated cv +test_that("extract_inner_fselect_results function works with benchmark and repeated cv", { at_1 = auto_fselector("random_search", lrn("classif.rpart"), rsmp("holdout"), msr("classif.ce"), term_evals = 4) at_2 = auto_fselector("random_search", lrn("classif.rpart"), rsmp("holdout"), msr("classif.ce"), term_evals = 4) resampling_outer = rsmp("repeated_cv", folds = 2, repeats = 3) @@ -39,11 +38,11 @@ test_that("extract_inner_fselect_results function works", { ibmr = extract_inner_fselect_results(bmr) expect_data_table(ibmr, nrows = 12) - expect_named(ibmr, c("experiment", "iteration", "Petal.Length", "Petal.Width", "Sepal.Length", "Sepal.Width", - "classif.ce", "features", "task_id", "learner_id", "resampling_id")) + expect_named(ibmr, c("experiment", "iteration", "Petal.Length", "Petal.Width", "Sepal.Length", "Sepal.Width", "classif.ce", "features", "task_id", "learner_id", "resampling_id")) expect_equal(unique(ibmr$experiment), c(1, 2)) +}) - # different tasks 
+test_that("extract_inner_fselect_results function works with multiple tasks", { at_1 = auto_fselector("random_search", lrn("classif.rpart"), rsmp("holdout"), msr("classif.ce"), term_evals = 4) at_2 = auto_fselector("random_search", lrn("classif.rpart"), rsmp("holdout"), msr("classif.ce"), term_evals = 4) resampling_outer = rsmp("cv", folds = 2) @@ -52,27 +51,28 @@ test_that("extract_inner_fselect_results function works", { ibmr = extract_inner_fselect_results(bmr) expect_data_table(ibmr, nrows = 8) - expect_named(ibmr, c("experiment", "iteration", "Petal.Length", "Petal.Width", "Sepal.Length", "Sepal.Width", "age", - "glucose", "insulin", "mass", "pedigree", "pregnant", "pressure", "triceps", "classif.ce", "features", "task_id", - "learner_id", "resampling_id")) + expect_named(ibmr, c("experiment", "iteration", "Petal.Length", "Petal.Width", "Sepal.Length", "Sepal.Width", "age", "glucose", "insulin", "mass", "pedigree", "pregnant", "pressure", "triceps", "classif.ce", "features", "task_id", "learner_id", "resampling_id")) expect_equal(unique(ibmr$experiment), c(1, 2, 3, 4)) +}) - # no model +test_that("extract_inner_fselect_results function works with no model", { at = auto_fselector("random_search", lrn("classif.rpart"), rsmp("holdout"), msr("classif.ce"), term_evals = 4) resampling_outer = rsmp("cv", folds = 2) rr = resample(tsk("iris"), at, resampling_outer, store_models = FALSE) expect_data_table(extract_inner_fselect_results(rr), nrows = 0, ncols = 0) +}) - # instance - at = AutoFSelector$new(lrn("classif.rpart"), rsmp("holdout"), msr("classif.ce"), trm("evals", n_evals = 4), +test_that("extract_inner_fselect_results function works with no instance", { + at = AutoFSelector$new(lrn("classif.rpart"), rsmp("holdout"), msr("classif.ce"), trm("evals", n_evals = 4), fselector = fs("random_search"), store_fselect_instance = FALSE, store_benchmark_result = FALSE) resampling_outer = rsmp("cv", folds = 2) rr = resample(tsk("iris"), at, resampling_outer, store_models = TRUE) 
expect_data_table(extract_inner_fselect_results(rr), nrows = 0, ncols = 0) +}) - # no models +test_that("extract_inner_fselect_results function works with benchmark and no models", { at_1 = auto_fselector("random_search", lrn("classif.rpart"), rsmp("holdout"), msr("classif.ce"), term_evals = 4) at_2 = auto_fselector("random_search", lrn("classif.rpart"), rsmp("holdout"), msr("classif.ce"), term_evals = 4) resampling_outer = rsmp("cv", folds = 2) @@ -80,30 +80,52 @@ test_that("extract_inner_fselect_results function works", { bmr = benchmark(grid, store_models = FALSE) expect_data_table(extract_inner_fselect_results(bmr), nrows = 0, ncols = 0) +}) + +test_that("extract_inner_fselect_results function works with mixed store instance", { + at_1 = AutoFSelector$new(lrn("classif.rpart"), rsmp("holdout"), msr("classif.ce"), trm("evals", n_evals = 4), + fselector = fs("random_search"), store_fselect_instance = FALSE, store_benchmark_result = FALSE) + at_2 = AutoFSelector$new(lrn("classif.rpart"), rsmp("holdout"), msr("classif.ce"), trm("evals", n_evals = 4), + fselector = fs("random_search")) + resampling_outer = rsmp("cv", folds = 2) + grid = benchmark_grid(tsk("iris"), list(at_1, at_2), resampling_outer) + bmr = benchmark(grid, store_models = TRUE) + + ibmr = extract_inner_fselect_results(bmr) + expect_data_table(ibmr, nrows = 2, ncols = 11) + expect_equal(unique(ibmr$experiment), 2) +}) - # https://github.com/mlr-org/mlr3/issues/647 - # mixed store instance - # at_1 = AutoFSelector$new(lrn("classif.rpart"), rsmp("holdout"), msr("classif.ce"), trm("evals", n_evals = 4), - # fselector = fs("random_search"), store_fselect_instance = FALSE, store_benchmark_result = FALSE) - # at_2 = AutoFSelector$new(lrn("classif.rpart"), rsmp("holdout"), msr("classif.ce"), trm("evals", n_evals = 4), - # fselector = fs("random_search")) - # resampling_outer = rsmp("cv", folds = 2) - # grid = benchmark_grid(tsk("iris"), list(at_1, at_2), resampling_outer) - # bmr = benchmark(grid, 
store_models = TRUE) - - # ibmr = extract_inner_fselect_results(bmr) - # expect_data_table(ibmr, nrows = 2, ncols = 9) - # expect_equal(unique(ibmr$experiment), 2) - - # autotuner and learner +test_that("extract_inner_fselect_results function works with learner and autotuner", { learner = lrn("classif.rpart") at = auto_fselector("random_search", lrn("classif.rpart"), rsmp("holdout"), msr("classif.ce"), term_evals = 4) + resampling_outer = rsmp("cv", folds = 2) grid = benchmark_grid(tsk("iris"), list(at, learner), resampling_outer) bmr = benchmark(grid, store_models = TRUE) ibmr = extract_inner_fselect_results(bmr) expect_data_table(ibmr, nrows = 2) - expect_named(ibmr, c("experiment", "iteration", "Petal.Length", "Petal.Width", "Sepal.Length", "Sepal.Width", - "classif.ce", "features", "task_id", "learner_id", "resampling_id")) + expect_named(ibmr, c("experiment", "iteration", "Petal.Length", "Petal.Width", "Sepal.Length", "Sepal.Width", "classif.ce", "features", "task_id", "learner_id", "resampling_id")) expect_equal(unique(ibmr$experiment), 1) }) + +test_that("extract_inner_fselect_results function works with resample and return of instance", { + rr = fselect_nested("random_search", tsk("iris"), lrn("classif.rpart"), rsmp("holdout"), rsmp("cv", folds = 2), msr("classif.ce"), term_evals = 4) + + irr = extract_inner_fselect_results(rr, fselect_instance = TRUE) + expect_data_table(irr, nrows = 2) + expect_named(irr, c("iteration", "Petal.Length", "Petal.Width", "Sepal.Length", "Sepal.Width", "classif.ce", "features", "fselect_instance", "task_id", "learner_id", "resampling_id")) +}) + +test_that("extract_inner_fselect_results function works with benchmark and return of instance", { + at_1 = auto_fselector("random_search", lrn("classif.rpart"), rsmp("holdout"), msr("classif.ce"), term_evals = 4) + at_2 = auto_fselector("random_search", lrn("classif.rpart"), rsmp("holdout"), msr("classif.ce"), term_evals = 4) + resampling_outer = rsmp("cv", folds = 2) + grid = 
benchmark_grid(tsk("iris"), list(at_1, at_2), resampling_outer) + bmr = benchmark(grid, store_models = TRUE) + + ibmr = extract_inner_fselect_results(bmr, fselect_instance = TRUE) + expect_data_table(ibmr, nrows = 4) + expect_named(ibmr, c("experiment", "iteration", "Petal.Length", "Petal.Width", "Sepal.Length", "Sepal.Width", "classif.ce", "features", "fselect_instance", "task_id", "learner_id", "resampling_id")) + expect_equal(unique(ibmr$experiment), c(1, 2)) +}) diff --git a/tests/testthat/test_fsi.R b/tests/testthat/test_fsi.R new file mode 100644 index 00000000..f9cec3b4 --- /dev/null +++ b/tests/testthat/test_fsi.R @@ -0,0 +1,19 @@ +test_that("fsi function creates a FSelectInstanceSingleCrit", { + instance = fsi( + task = tsk("pima"), + learner = lrn("classif.rpart"), + resampling = rsmp ("holdout"), + measures = msr("classif.ce"), + terminator = trm("evals", n_evals = 2)) + expect_class(instance, "FSelectInstanceSingleCrit") +}) + +test_that("fsi function creates a FSelectInstanceMultiCrit", { + instance = fsi( + task = tsk("pima"), + learner = lrn("classif.rpart"), + resampling = rsmp ("holdout"), + measures = msrs(c("classif.ce", "classif.acc")), + terminator = trm("evals", n_evals = 2)) + expect_class(instance, "FSelectInstanceMultiCrit") +})