diff --git a/CHANGELOG.md b/CHANGELOG.md index dcf89b9ae..e90a7b268 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,7 @@ This release comes with several API changes. For an updated overview of the avai ### API Changes - The options `min_samples` and `max_samples` have been added to the values of the command line arguments `--feature-sampling` and `--instance-sampling`. +- Similar to other sampling methods, the options `sample_size`, `min_samples`, and `max_samples` can now be specified via the command line argument `--output-sampling` when set to the value `without-replacement`. ### Quality-of-Life Improvements diff --git a/cpp/subprojects/common/include/mlrl/common/sampling/output_sampling_without_replacement.hpp b/cpp/subprojects/common/include/mlrl/common/sampling/output_sampling_without_replacement.hpp index bfda43bce..b59068b5a 100644 --- a/cpp/subprojects/common/include/mlrl/common/sampling/output_sampling_without_replacement.hpp +++ b/cpp/subprojects/common/include/mlrl/common/sampling/output_sampling_without_replacement.hpp @@ -17,20 +17,54 @@ class MLRLCOMMON_API IOutputSamplingWithoutReplacementConfig { virtual ~IOutputSamplingWithoutReplacementConfig() {} /** - * Returns the number of outputs that are included in a sample. + * Returns the fraction of outputs that are included in a sample. * - * @return The number of outputs that are included in a sample + * @return The fraction of outputs that are included in a sample */ - virtual uint32 getNumSamples() const = 0; + virtual float32 getSampleSize() const = 0; /** - * Sets the number of outputs that should be included in a sample. + * Sets the fraction of outputs that should be included in a sample. * - * @param numSamples The number of outputs that should be included in a sample. Must be at least 1 + * @param sampleSize The fraction of outputs that should be included in a sample, e.g., a value of 0.6 + * corresponds to 60 % of the available outputs. 
Must be in (0, 1) * @return A reference to an object of type `IOutputSamplingWithoutReplacementConfig` that allows - * further configuration of the sampling method + * further configuration of the method for sampling outputs */ - virtual IOutputSamplingWithoutReplacementConfig& setNumSamples(uint32 numSamples) = 0; + virtual IOutputSamplingWithoutReplacementConfig& setSampleSize(float32 sampleSize) = 0; + + /** + * Returns the minimum number of outputs that are included in a sample. + * + * @return The minimum number of outputs that are included in a sample + */ + virtual uint32 getMinSamples() const = 0; + + /** + * Sets the minimum number of outputs that should be included in a sample. + * + * @param minSamples The minimum number of outputs that should be included in a sample. Must be at least 1 + * @return A reference to an object of type `IOutputSamplingWithoutReplacementConfig` that allows + * further configuration of the method for sampling outputs + */ + virtual IOutputSamplingWithoutReplacementConfig& setMinSamples(uint32 minSamples) = 0; + + /** + * Returns the maximum number of outputs that are included in a sample. + * + * @return The maximum number of outputs that are included in a sample + */ + virtual uint32 getMaxSamples() const = 0; + + /** + * Sets the maximum number of outputs that should be included in a sample. + * + * @param maxSamples The maximum number of outputs that should be included in a sample. 
Must be at least the value + * returned by `getMinSamples` or 0, if the number of outputs should not be restricted + * @return A reference to an object of type `IOutputSamplingWithoutReplacementConfig` that allows + * further configuration of the method for sampling outputs + */ + virtual IOutputSamplingWithoutReplacementConfig& setMaxSamples(uint32 maxSamples) = 0; }; /** @@ -42,7 +76,11 @@ class OutputSamplingWithoutReplacementConfig final : public IOutputSamplingConfi const ReadableProperty rngConfig_; - uint32 numSamples_; + float32 sampleSize_; + + uint32 minSamples_; + + uint32 maxSamples_; public: @@ -52,9 +90,17 @@ class OutputSamplingWithoutReplacementConfig final : public IOutputSamplingConfi */ OutputSamplingWithoutReplacementConfig(ReadableProperty rngConfig); - uint32 getNumSamples() const override; + float32 getSampleSize() const override; + + IOutputSamplingWithoutReplacementConfig& setSampleSize(float32 sampleSize) override; + + uint32 getMinSamples() const override; + + IOutputSamplingWithoutReplacementConfig& setMinSamples(uint32 minSamples) override; + + uint32 getMaxSamples() const override; - IOutputSamplingWithoutReplacementConfig& setNumSamples(uint32 numSamples) override; + IOutputSamplingWithoutReplacementConfig& setMaxSamples(uint32 maxSamples) override; std::unique_ptr createOutputSamplingFactory( const IOutputMatrix& outputMatrix) const override; diff --git a/cpp/subprojects/common/src/mlrl/common/sampling/output_sampling_without_replacement.cpp b/cpp/subprojects/common/src/mlrl/common/sampling/output_sampling_without_replacement.cpp index 18800e332..8ff21b8aa 100644 --- a/cpp/subprojects/common/src/mlrl/common/sampling/output_sampling_without_replacement.cpp +++ b/cpp/subprojects/common/src/mlrl/common/sampling/output_sampling_without_replacement.cpp @@ -3,6 +3,7 @@ #include "index_sampling.hpp" #include "mlrl/common/indices/index_vector_partial.hpp" #include "mlrl/common/iterator/iterator_index.hpp" +#include "mlrl/common/util/math.hpp" 
#include "mlrl/common/util/validation.hpp" /** @@ -67,20 +68,43 @@ class OutputSamplingWithoutReplacementFactory final : public IOutputSamplingFact }; OutputSamplingWithoutReplacementConfig::OutputSamplingWithoutReplacementConfig(ReadableProperty rngConfig) - : rngConfig_(rngConfig), numSamples_(1) {} + : rngConfig_(rngConfig), sampleSize_(0.33f), minSamples_(1), maxSamples_(1) {} -uint32 OutputSamplingWithoutReplacementConfig::getNumSamples() const { - return numSamples_; +float32 OutputSamplingWithoutReplacementConfig::getSampleSize() const { + return sampleSize_; } -IOutputSamplingWithoutReplacementConfig& OutputSamplingWithoutReplacementConfig::setNumSamples(uint32 numSamples) { - util::assertGreaterOrEqual("numSamples", numSamples, 1); - numSamples_ = numSamples; +IOutputSamplingWithoutReplacementConfig& OutputSamplingWithoutReplacementConfig::setSampleSize(float32 sampleSize) { + util::assertGreater("sampleSize", sampleSize, 0); + util::assertLess("sampleSize", sampleSize, 1); + sampleSize_ = sampleSize; + return *this; +} + +uint32 OutputSamplingWithoutReplacementConfig::getMinSamples() const { + return minSamples_; +} + +IOutputSamplingWithoutReplacementConfig& OutputSamplingWithoutReplacementConfig::setMinSamples(uint32 minSamples) { + util::assertGreaterOrEqual("minSamples", minSamples, 1); + minSamples_ = minSamples; + return *this; +} + +uint32 OutputSamplingWithoutReplacementConfig::getMaxSamples() const { + return maxSamples_; +} + +IOutputSamplingWithoutReplacementConfig& OutputSamplingWithoutReplacementConfig::setMaxSamples(uint32 maxSamples) { + if (maxSamples != 0) util::assertGreaterOrEqual("maxSamples", maxSamples, minSamples_); + maxSamples_ = maxSamples; return *this; } std::unique_ptr OutputSamplingWithoutReplacementConfig::createOutputSamplingFactory( const IOutputMatrix& outputMatrix) const { + uint32 numOutputs = outputMatrix.getNumOutputs(); + uint32 numSamples = util::calculateBoundedFraction(numOutputs, sampleSize_, minSamples_, 
maxSamples_); return std::make_unique(rngConfig_.get().createRNGFactory(), - outputMatrix.getNumOutputs(), numSamples_); + outputMatrix.getNumOutputs(), numSamples); } diff --git a/doc/user_guide/boosting/parameters.md b/doc/user_guide/boosting/parameters.md index c1ccbcc8b..1ff33ba22 100644 --- a/doc/user_guide/boosting/parameters.md +++ b/doc/user_guide/boosting/parameters.md @@ -405,9 +405,17 @@ The seed to be used by random number generators. The given value must be at leas `'without-replacement'` The outputs to be considered when learning a new rule are chosen randomly. The following options may be provided using the {ref}`bracket notation`: - - `num_samples` *(Default value = `1`)* + - `sample_size` *(Default value = `0.33`)* - The number of outputs to be included in a sample. The given value must be at least 1. + The percentage of outputs to be included in a sample. For example, a value of 0.6 corresponds to 60% of the outputs. The given value must be in (0, 1). + + - `min_samples` *(Default value = `1`)* + + The minimum number of outputs to be included in a sample. The given value must be at least 1. + + - `max_samples` *(Default value = `1`)* + + The maximum number of outputs to be included in a sample. The given value must be at least the value of `min_samples` or 0, if the number of outputs should not be restricted. ``` (boosting_parameters_feature_sampling)= diff --git a/doc/user_guide/seco/parameters.md b/doc/user_guide/seco/parameters.md index 5791f21b3..9a34b3356 100644 --- a/doc/user_guide/seco/parameters.md +++ b/doc/user_guide/seco/parameters.md @@ -335,9 +335,17 @@ The seed to be used by random number generators. The given value must be at leas `'without-replacement'` The outputs to be considered when learning a new rule are chosen randomly. 
The following options may be provided using the {ref}`bracket notation`: - - `num_samples` *(Default value = `1`)* + - `sample_size` *(Default value = `0.33`)* - The number of outputs to be included in a sample. The given value must be at least 1. + The percentage of outputs to be included in a sample. For example, a value of 0.6 corresponds to 60% of the outputs. The given value must be in (0, 1). + + - `min_samples` *(Default value = `1`)* + + The minimum number of outputs to be included in a sample. The given value must be at least 1. + + - `max_samples` *(Default value = `1`)* + + The maximum number of outputs to be included in a sample. The given value must be at least the value of `min_samples` or 0, if the number of outputs should not be restricted. ``` (seco_parameters_feature_sampling)= diff --git a/python/subprojects/common/mlrl/common/config.py b/python/subprojects/common/mlrl/common/config.py index 9f47f8438..bf6edc859 100644 --- a/python/subprojects/common/mlrl/common/config.py +++ b/python/subprojects/common/mlrl/common/config.py @@ -46,8 +46,6 @@ OPTION_MAX_SAMPLES = 'max_samples' -OPTION_NUM_SAMPLES = 'num_samples' - BINNING_EQUAL_FREQUENCY = 'equal-frequency' BINNING_EQUAL_WIDTH = 'equal-width' @@ -398,7 +396,7 @@ def __init__(self): self.add_value(name=NONE, mixin=NoFeatureSamplingMixin) self.add_value(name=SAMPLING_WITHOUT_REPLACEMENT, mixin=FeatureSamplingWithoutReplacementMixin, - options={OPTION_NUM_SAMPLES}) + options={OPTION_SAMPLE_SIZE, OPTION_MIN_SAMPLES, OPTION_MAX_SAMPLES}) self.add_value(name=self.OUTPUT_SAMPLING_ROUND_ROBIN, mixin=RoundRobinOutputSamplingMixin) def _configure(self, config, value: str, options: Optional[Options]): @@ -406,7 +404,9 @@ def _configure(self, config, value: str, options: Optional[Options]): config.use_no_output_sampling() elif value == SAMPLING_WITHOUT_REPLACEMENT: conf = config.use_output_sampling_without_replacement() - conf.set_num_samples(options.get_int(OPTION_NUM_SAMPLES, conf.get_num_samples())) + 
conf.set_sample_size(options.get_float(OPTION_SAMPLE_SIZE, conf.get_sample_size())) + conf.set_min_samples(options.get_int(OPTION_MIN_SAMPLES, conf.get_min_samples())) + conf.set_max_samples(options.get_int(OPTION_MAX_SAMPLES, conf.get_max_samples())) elif value == self.OUTPUT_SAMPLING_ROUND_ROBIN: config.use_round_robin_output_sampling() diff --git a/python/subprojects/common/mlrl/common/cython/output_sampling.pxd b/python/subprojects/common/mlrl/common/cython/output_sampling.pxd index cbf44405a..63e6ba026 100644 --- a/python/subprojects/common/mlrl/common/cython/output_sampling.pxd +++ b/python/subprojects/common/mlrl/common/cython/output_sampling.pxd @@ -1,4 +1,4 @@ -from mlrl.common.cython._types cimport uint32 +from mlrl.common.cython._types cimport float32, uint32 cdef extern from "mlrl/common/sampling/output_sampling_without_replacement.hpp" nogil: @@ -7,9 +7,17 @@ cdef extern from "mlrl/common/sampling/output_sampling_without_replacement.hpp" # Functions: - uint32 getNumSamples() const + float32 getSampleSize() const - IOutputSamplingWithoutReplacementConfig& setNumSamples(uint32 numSamples) except + + IOutputSamplingWithoutReplacementConfig& setSampleSize(float32 sampleSize) except + + + uint32 getMinSamples() const + + IOutputSamplingWithoutReplacementConfig& setMinSamples(uint32 minSamples) except + + + uint32 getMaxSamples() const + + IOutputSamplingWithoutReplacementConfig& setMaxSamples(uint32 maxSamples) except + cdef class OutputSamplingWithoutReplacementConfig: diff --git a/python/subprojects/common/mlrl/common/cython/output_sampling.pyx b/python/subprojects/common/mlrl/common/cython/output_sampling.pyx index bc7d5c270..399a06b14 100644 --- a/python/subprojects/common/mlrl/common/cython/output_sampling.pyx +++ b/python/subprojects/common/mlrl/common/cython/output_sampling.pyx @@ -1,7 +1,7 @@ """ @author: Michael Rapp (michael.rapp.ml@gmail.com) """ -from mlrl.common.cython.validation import assert_greater, 
assert_greater_or_equal, assert_less cdef class OutputSamplingWithoutReplacementConfig: @@ -9,22 +9,66 @@ cdef class OutputSamplingWithoutReplacementConfig: Allows to configure a method for sampling outputs without replacement. """ - def get_num_samples(self) -> int: + def get_sample_size(self) -> float: """ - Returns the number of outputs that are included in a sample. + Returns the fraction of outputs that are included in a sample. - :return: The number of outputs that are included in a sample + :return: The fraction of outputs that are included in a sample """ - return self.config_ptr.getNumSamples() + return self.config_ptr.getSampleSize() - def set_num_samples(self, num_samples: int) -> OutputSamplingWithoutReplacementConfig: + def set_sample_size(self, sample_size: float) -> OutputSamplingWithoutReplacementConfig: """ - Sets the number of outputs that should be included in a sample. + Sets the fraction of outputs that should be included in a sample. - :param num_samples: The number of outputs that should be included in a sample. Must be at least 1 - :return: An `OutputSamplingWithoutReplacementConfig` that allows further configuration of the - sampling method + :param sample_size: The fraction of outputs that should be included in a sample, e.g., a value of 0.6 + corresponds to 60 % of the available outputs. Must be in (0, 1) + :return: An `OutputSamplingWithoutReplacementConfig` that allows further configuration of the method + for sampling outputs """ - assert_greater_or_equal('num_samples', num_samples, 1) - self.config_ptr.setNumSamples(num_samples) + assert_greater('sample_size', sample_size, 0) + assert_less('sample_size', sample_size, 1) + self.config_ptr.setSampleSize(sample_size) + return self + + def get_min_samples(self) -> int: + """ + Returns the minimum number of outputs that are included in a sample. 
+ + :return: The minimum number of outputs that are included in a sample + """ + return self.config_ptr.getMinSamples() + + def set_min_samples(self, min_samples: int) -> OutputSamplingWithoutReplacementConfig: + """ + Sets the minimum number of outputs that should be included in a sample. + + :param min_samples: The minimum number of outputs that should be included in a sample. Must be at least 1 + :return: An `OutputSamplingWithoutReplacementConfig` that allows further configuration of the method + for sampling outputs + """ + assert_greater_or_equal('min_samples', min_samples, 1) + self.config_ptr.setMinSamples(min_samples) + return self + + def get_max_samples(self) -> int: + """ + Returns the maximum number of outputs that are included in a sample. + + :return: The maximum number of outputs that are included in a sample + """ + return self.config_ptr.getMaxSamples() + + def set_max_samples(self, max_samples: int) -> OutputSamplingWithoutReplacementConfig: + """ + Sets the maximum number of outputs that should be included in a sample. + + :param max_samples: The maximum number of outputs that should be included in a sample. Must be at least + `get_min_samples()` or 0, if the number of outputs should not be restricted + :return: An `OutputSamplingWithoutReplacementConfig` that allows further configuration of the method + for sampling outputs + """ + if max_samples != 0: + assert_greater_or_equal('max_samples', max_samples, self.get_min_samples()) + self.config_ptr.setMaxSamples(max_samples) return self