diff --git a/CHANGELOG.md b/CHANGELOG.md index 661b58434..dcf89b9ae 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,14 @@ tocdepth: 2 A major update to the BOOMER algorithm that comes with the following changes. +```{warning} +This release comes with several API changes. For an updated overview of the available parameters and command line arguments, please refer to the [documentation](https://mlrl-boomer.readthedocs.io/en/0.11.0/). +``` + +### API Changes + +- The options `min_samples` and `max_samples` have been added to the values of the command line arguments `--feature-sampling` and `--instance-sampling`. + ### Quality-of-Life Improvements - C++ 20 is now required for compiling the project. diff --git a/cpp/subprojects/common/include/mlrl/common/sampling/feature_sampling_without_replacement.hpp b/cpp/subprojects/common/include/mlrl/common/sampling/feature_sampling_without_replacement.hpp index af1dc94a5..a2b22863a 100644 --- a/cpp/subprojects/common/include/mlrl/common/sampling/feature_sampling_without_replacement.hpp +++ b/cpp/subprojects/common/include/mlrl/common/sampling/feature_sampling_without_replacement.hpp @@ -34,6 +34,39 @@ class MLRLCOMMON_API IFeatureSamplingWithoutReplacementConfig { */ virtual IFeatureSamplingWithoutReplacementConfig& setSampleSize(float32 sampleSize) = 0; + /** + * Returns the minimum number of features that are included in a sample. + * + * @return The minimum number of features that are included in a sample + */ + virtual uint32 getMinSamples() const = 0; + + /** + * Sets the minimum number of features that should be included in a sample. + * + * @param minSamples The minimum number of features that should be included in a sample. 
Must be at least 1 + * @return A reference to an object of type `IFeatureSamplingWithoutReplacementConfig` that allows + * further configuration of the method for sampling features + */ + virtual IFeatureSamplingWithoutReplacementConfig& setMinSamples(uint32 minSamples) = 0; + + /** + * Returns the maximum number of features that are included in a sample. + * + * @return The maximum number of features that are included in a sample + */ + virtual uint32 getMaxSamples() const = 0; + + /** + * Sets the maximum number of features that should be included in a sample. + * + * @param maxSamples The maximum number of features that should be included in a sample. Must be at least the value + * returned by `getMinSamples` or 0, if the number of features should not be restricted + * @return A reference to an object of type `IFeatureSamplingWithoutReplacementConfig` that allows + * further configuration of the method for sampling features + */ + virtual IFeatureSamplingWithoutReplacementConfig& setMaxSamples(uint32 maxSamples) = 0; + /** * Returns the number of trailing features that are always included in a sample. 
* @@ -63,6 +96,10 @@ class FeatureSamplingWithoutReplacementConfig final : public IFeatureSamplingCon float32 sampleSize_; + uint32 minSamples_; + + uint32 maxSamples_; + uint32 numRetained_; public: @@ -77,6 +114,14 @@ class FeatureSamplingWithoutReplacementConfig final : public IFeatureSamplingCon IFeatureSamplingWithoutReplacementConfig& setSampleSize(float32 sampleSize) override; + uint32 getMinSamples() const override; + + IFeatureSamplingWithoutReplacementConfig& setMinSamples(uint32 minSamples) override; + + uint32 getMaxSamples() const override; + + IFeatureSamplingWithoutReplacementConfig& setMaxSamples(uint32 maxSamples) override; + uint32 getNumRetained() const override; IFeatureSamplingWithoutReplacementConfig& setNumRetained(uint32 numRetained) override; diff --git a/cpp/subprojects/common/include/mlrl/common/sampling/instance_sampling_stratified_example_wise.hpp b/cpp/subprojects/common/include/mlrl/common/sampling/instance_sampling_stratified_example_wise.hpp index bd714c3d0..081651f65 100644 --- a/cpp/subprojects/common/include/mlrl/common/sampling/instance_sampling_stratified_example_wise.hpp +++ b/cpp/subprojects/common/include/mlrl/common/sampling/instance_sampling_stratified_example_wise.hpp @@ -34,6 +34,39 @@ class MLRLCOMMON_API IExampleWiseStratifiedInstanceSamplingConfig { * allows further configuration of the method for sampling instances */ virtual IExampleWiseStratifiedInstanceSamplingConfig& setSampleSize(float32 sampleSize) = 0; + + /** + * Returns the minimum number of examples that are included in a sample. + * + * @return The minimum number of examples that are included in a sample + */ + virtual uint32 getMinSamples() const = 0; + + /** + * Sets the minimum number of examples that should be included in a sample. + * + * @param minSamples The minimum number of examples that should be included in a sample. 
Must be at least 1 + * @return A reference to an object of type `IExampleWiseStratifiedInstanceSamplingConfig` that + * allows further configuration of the method for sampling instances + */ + virtual IExampleWiseStratifiedInstanceSamplingConfig& setMinSamples(uint32 minSamples) = 0; + + /** + * Returns the maximum number of examples that are included in a sample. + * + * @return The maximum number of examples that are included in a sample + */ + virtual uint32 getMaxSamples() const = 0; + + /** + * Sets the maximum number of examples that should be included in a sample. + * + * @param maxSamples The maximum number of examples that should be included in a sample. Must be at least the value + * returned by `getMinSamples` or 0, if the number of examples should not be restricted + * @return A reference to an object of type `IExampleWiseStratifiedInstanceSamplingConfig` that + * allows further configuration of the method for sampling instances + */ + virtual IExampleWiseStratifiedInstanceSamplingConfig& setMaxSamples(uint32 maxSamples) = 0; }; /** @@ -48,6 +81,10 @@ class ExampleWiseStratifiedInstanceSamplingConfig final : public IClassification float32 sampleSize_; + uint32 minSamples_; + + uint32 maxSamples_; + public: /** @@ -60,6 +97,14 @@ class ExampleWiseStratifiedInstanceSamplingConfig final : public IClassification IExampleWiseStratifiedInstanceSamplingConfig& setSampleSize(float32 sampleSize) override; + uint32 getMinSamples() const override; + + IExampleWiseStratifiedInstanceSamplingConfig& setMinSamples(uint32 minSamples) override; + + uint32 getMaxSamples() const override; + + IExampleWiseStratifiedInstanceSamplingConfig& setMaxSamples(uint32 maxSamples) override; + std::unique_ptr createClassificationInstanceSamplingFactory() const override; }; diff --git a/cpp/subprojects/common/include/mlrl/common/sampling/instance_sampling_stratified_output_wise.hpp b/cpp/subprojects/common/include/mlrl/common/sampling/instance_sampling_stratified_output_wise.hpp index 
7b0a778f0..8a1c798e8 100644 --- a/cpp/subprojects/common/include/mlrl/common/sampling/instance_sampling_stratified_output_wise.hpp +++ b/cpp/subprojects/common/include/mlrl/common/sampling/instance_sampling_stratified_output_wise.hpp @@ -35,6 +35,39 @@ class MLRLCOMMON_API IOutputWiseStratifiedInstanceSamplingConfig { * allows further configuration of the method for sampling instances */ virtual IOutputWiseStratifiedInstanceSamplingConfig& setSampleSize(float32 sampleSize) = 0; + + /** + * Returns the minimum number of examples that are included in a sample. + * + * @return The minimum number of examples that are included in a sample + */ + virtual uint32 getMinSamples() const = 0; + + /** + * Sets the minimum number of examples that should be included in a sample. + * + * @param minSamples The minimum number of examples that should be included in a sample. Must be at least 1 + * @return A reference to an object of type `IOutputWiseStratifiedInstanceSamplingConfig` that + * allows further configuration of the method for sampling instances + */ + virtual IOutputWiseStratifiedInstanceSamplingConfig& setMinSamples(uint32 minSamples) = 0; + + /** + * Returns the maximum number of examples that are included in a sample. + * + * @return The maximum number of examples that are included in a sample + */ + virtual uint32 getMaxSamples() const = 0; + + /** + * Sets the maximum number of examples that should be included in a sample. + * + * @param maxSamples The maximum number of examples that should be included in a sample. 
Must be at least the value + * returned by `getMinSamples` or 0, if the number of examples should not be restricted + * @return A reference to an object of type `IOutputWiseStratifiedInstanceSamplingConfig` that + * allows further configuration of the method for sampling instances + */ + virtual IOutputWiseStratifiedInstanceSamplingConfig& setMaxSamples(uint32 maxSamples) = 0; }; /** @@ -49,6 +82,10 @@ class OutputWiseStratifiedInstanceSamplingConfig final : public IClassificationI float32 sampleSize_; + uint32 minSamples_; + + uint32 maxSamples_; + public: /** @@ -61,6 +98,14 @@ class OutputWiseStratifiedInstanceSamplingConfig final : public IClassificationI IOutputWiseStratifiedInstanceSamplingConfig& setSampleSize(float32 sampleSize) override; + uint32 getMinSamples() const override; + + IOutputWiseStratifiedInstanceSamplingConfig& setMinSamples(uint32 minSamples) override; + + uint32 getMaxSamples() const override; + + IOutputWiseStratifiedInstanceSamplingConfig& setMaxSamples(uint32 maxSamples) override; + std::unique_ptr createClassificationInstanceSamplingFactory() const override; }; diff --git a/cpp/subprojects/common/include/mlrl/common/sampling/instance_sampling_with_replacement.hpp b/cpp/subprojects/common/include/mlrl/common/sampling/instance_sampling_with_replacement.hpp index 6030380ae..dce707900 100644 --- a/cpp/subprojects/common/include/mlrl/common/sampling/instance_sampling_with_replacement.hpp +++ b/cpp/subprojects/common/include/mlrl/common/sampling/instance_sampling_with_replacement.hpp @@ -33,6 +33,39 @@ class MLRLCOMMON_API IInstanceSamplingWithReplacementConfig { * further configuration of the method for sampling instances */ virtual IInstanceSamplingWithReplacementConfig& setSampleSize(float32 sampleSize) = 0; + + /** + * Returns the minimum number of examples that are included in a sample. 
+ * + * @return The minimum number of examples that are included in a sample + */ + virtual uint32 getMinSamples() const = 0; + + /** + * Sets the minimum number of examples that should be included in a sample. + * + * @param minSamples The minimum number of examples that should be included in a sample. Must be at least 1 + * @return A reference to an object of type `IInstanceSamplingWithReplacementConfig` that allows + * further configuration of the method for sampling instances + */ + virtual IInstanceSamplingWithReplacementConfig& setMinSamples(uint32 minSamples) = 0; + + /** + * Returns the maximum number of examples that are included in a sample. + * + * @return The maximum number of examples that are included in a sample + */ + virtual uint32 getMaxSamples() const = 0; + + /** + * Sets the maximum number of examples that should be included in a sample. + * + * @param maxSamples The maximum number of examples that should be included in a sample. Must be at least the value + * returned by `getMinSamples` or 0, if the number of examples should not be restricted + * @return A reference to an object of type `IInstanceSamplingWithReplacementConfig` that allows + * further configuration of the method for sampling instances + */ + virtual IInstanceSamplingWithReplacementConfig& setMaxSamples(uint32 maxSamples) = 0; }; /** @@ -47,6 +80,10 @@ class InstanceSamplingWithReplacementConfig final : public IClassificationInstan float32 sampleSize_; + uint32 minSamples_; + + uint32 maxSamples_; + public: /** @@ -59,6 +96,14 @@ class InstanceSamplingWithReplacementConfig final : public IClassificationInstan IInstanceSamplingWithReplacementConfig& setSampleSize(float32 sampleSize) override; + uint32 getMinSamples() const override; + + IInstanceSamplingWithReplacementConfig& setMinSamples(uint32 minSamples) override; + + uint32 getMaxSamples() const override; + + IInstanceSamplingWithReplacementConfig& setMaxSamples(uint32 maxSamples) override; + std::unique_ptr 
createClassificationInstanceSamplingFactory() const override; diff --git a/cpp/subprojects/common/include/mlrl/common/sampling/instance_sampling_without_replacement.hpp b/cpp/subprojects/common/include/mlrl/common/sampling/instance_sampling_without_replacement.hpp index 4156b90b3..13f08f825 100644 --- a/cpp/subprojects/common/include/mlrl/common/sampling/instance_sampling_without_replacement.hpp +++ b/cpp/subprojects/common/include/mlrl/common/sampling/instance_sampling_without_replacement.hpp @@ -33,6 +33,39 @@ class MLRLCOMMON_API IInstanceSamplingWithoutReplacementConfig { * further configuration of the method for sampling instances */ virtual IInstanceSamplingWithoutReplacementConfig& setSampleSize(float32 sampleSize) = 0; + + /** + * Returns the minimum number of examples that are included in a sample. + * + * @return The minimum number of examples that are included in a sample + */ + virtual uint32 getMinSamples() const = 0; + + /** + * Sets the minimum number of examples that should be included in a sample. + * + * @param minSamples The minimum number of examples that should be included in a sample. Must be at least 1 + * @return A reference to an object of type `IInstanceSamplingWithoutReplacementConfig` that allows + * further configuration of the method for sampling instances + */ + virtual IInstanceSamplingWithoutReplacementConfig& setMinSamples(uint32 minSamples) = 0; + + /** + * Returns the maximum number of examples that are included in a sample. + * + * @return The maximum number of examples that are included in a sample + */ + virtual uint32 getMaxSamples() const = 0; + + /** + * Sets the maximum number of examples that should be included in a sample. + * + * @param maxSamples The maximum number of examples that should be included in a sample. 
Must be at least the value + * returned by `getMinSamples` or 0, if the number of examples should not be restricted + * @return A reference to an object of type `IInstanceSamplingWithoutReplacementConfig` that allows + * further configuration of the method for sampling instances + */ + virtual IInstanceSamplingWithoutReplacementConfig& setMaxSamples(uint32 maxSamples) = 0; }; /** @@ -47,6 +80,10 @@ class InstanceSamplingWithoutReplacementConfig final : public IClassificationIns float32 sampleSize_; + uint32 minSamples_; + + uint32 maxSamples_; + public: /** @@ -59,6 +96,14 @@ class InstanceSamplingWithoutReplacementConfig final : public IClassificationIns IInstanceSamplingWithoutReplacementConfig& setSampleSize(float32 sampleSize) override; + uint32 getMinSamples() const override; + + IInstanceSamplingWithoutReplacementConfig& setMinSamples(uint32 minSamples) override; + + uint32 getMaxSamples() const override; + + IInstanceSamplingWithoutReplacementConfig& setMaxSamples(uint32 maxSamples) override; + std::unique_ptr createClassificationInstanceSamplingFactory() const override; diff --git a/cpp/subprojects/common/include/mlrl/common/sampling/stratified_sampling_example_wise.hpp b/cpp/subprojects/common/include/mlrl/common/sampling/stratified_sampling_example_wise.hpp index dc4c138ab..1e1d97065 100644 --- a/cpp/subprojects/common/include/mlrl/common/sampling/stratified_sampling_example_wise.hpp +++ b/cpp/subprojects/common/include/mlrl/common/sampling/stratified_sampling_example_wise.hpp @@ -46,8 +46,12 @@ class ExampleWiseStratification final { * * @param weightVector A reference to an object of type `BitWeightVector`, the weights should be written to * @param sampleSize The fraction of the available examples to be selected + * @param minSamples The minimum number of examples to be included in the sample. Must be at least 1 + * @param maxSamples The maximum number of examples to be included in the sample. 
Must be at least + * `minSamples` or 0, if the number of examples should not be restricted */ - void sampleWeights(BitWeightVector& weightVector, float32 sampleSize) const; + void sampleWeights(BitWeightVector& weightVector, float32 sampleSize, uint32 minSamples, + uint32 maxSamples) const; /** * Randomly splits the available examples into two distinct sets and updates a given `BiPartition` accordingly. diff --git a/cpp/subprojects/common/include/mlrl/common/sampling/stratified_sampling_output_wise.hpp b/cpp/subprojects/common/include/mlrl/common/sampling/stratified_sampling_output_wise.hpp index b75f40177..d6bd4dc18 100644 --- a/cpp/subprojects/common/include/mlrl/common/sampling/stratified_sampling_output_wise.hpp +++ b/cpp/subprojects/common/include/mlrl/common/sampling/stratified_sampling_output_wise.hpp @@ -49,8 +49,11 @@ class LabelWiseStratification final { * * @param weightVector A reference to an object of type `BitWeightVector`, the weights should be written to * @param sampleSize The fraction of the available examples to be selected + * @param minSamples The minimum number of examples to be included in the sample. Must be at least 1 + * @param maxSamples The maximum number of examples to be included in the sample. Must be at least + * `minSamples` or 0, if the number of examples should not be restricted */ - void sampleWeights(BitWeightVector& weightVector, float32 sampleSize); + void sampleWeights(BitWeightVector& weightVector, float32 sampleSize, uint32 minSamples, uint32 maxSamples); /** * Randomly splits the available examples into two distinct sets and updates a given `BiPartition` accordingly. 
diff --git a/cpp/subprojects/common/src/mlrl/common/sampling/feature_sampling_without_replacement.cpp b/cpp/subprojects/common/src/mlrl/common/sampling/feature_sampling_without_replacement.cpp index cbbb8bd65..f48d904f4 100644 --- a/cpp/subprojects/common/src/mlrl/common/sampling/feature_sampling_without_replacement.cpp +++ b/cpp/subprojects/common/src/mlrl/common/sampling/feature_sampling_without_replacement.cpp @@ -4,6 +4,7 @@ #include "mlrl/common/indices/index_vector_partial.hpp" #include "mlrl/common/iterator/iterator_index.hpp" #include "mlrl/common/sampling/feature_sampling_predefined.hpp" +#include "mlrl/common/util/math.hpp" #include "mlrl/common/util/validation.hpp" /** @@ -98,7 +99,7 @@ class FeatureSamplingWithoutReplacementFactory final : public IFeatureSamplingFa }; FeatureSamplingWithoutReplacementConfig::FeatureSamplingWithoutReplacementConfig(ReadableProperty rngConfig) - : rngConfig_(rngConfig), sampleSize_(0), numRetained_(0) {} + : rngConfig_(rngConfig), sampleSize_(0), minSamples_(1), maxSamples_(0), numRetained_(0) {} float32 FeatureSamplingWithoutReplacementConfig::getSampleSize() const { return sampleSize_; @@ -111,6 +112,26 @@ IFeatureSamplingWithoutReplacementConfig& FeatureSamplingWithoutReplacementConfi return *this; } +uint32 FeatureSamplingWithoutReplacementConfig::getMinSamples() const { + return minSamples_; +} + +IFeatureSamplingWithoutReplacementConfig& FeatureSamplingWithoutReplacementConfig::setMinSamples(uint32 minSamples) { + util::assertGreaterOrEqual("minSamples", minSamples, 1); + minSamples_ = minSamples; + return *this; +} + +uint32 FeatureSamplingWithoutReplacementConfig::getMaxSamples() const { + return maxSamples_; +} + +IFeatureSamplingWithoutReplacementConfig& FeatureSamplingWithoutReplacementConfig::setMaxSamples(uint32 maxSamples) { + if (maxSamples != 0) util::assertGreaterOrEqual("maxSamples", maxSamples, minSamples_); + maxSamples_ = maxSamples; + return *this; +} + uint32 
FeatureSamplingWithoutReplacementConfig::getNumRetained() const { return numRetained_; } @@ -126,8 +147,14 @@ std::unique_ptr FeatureSamplingWithoutReplacementConfig uint32 numFeatures = featureMatrix.getNumFeatures(); uint32 numRetained = std::min(numRetained_, numFeatures); uint32 numRemainingFeatures = numFeatures - numRetained; - uint32 numSamples = - static_cast(sampleSize_ > 0 ? sampleSize_ * numRemainingFeatures : log2(numRemainingFeatures - 1) + 1); + uint32 numSamples; + + if (sampleSize_ > 0) { + numSamples = util::calculateBoundedFraction(numRemainingFeatures, sampleSize_, minSamples_, maxSamples_); + } else { + numSamples = static_cast(log2(numRemainingFeatures - 1) + 1); + } + return std::make_unique(rngConfig_.get().createRNGFactory(), numFeatures, numSamples, numRetained); } diff --git a/cpp/subprojects/common/src/mlrl/common/sampling/instance_sampling_stratified_example_wise.cpp b/cpp/subprojects/common/src/mlrl/common/sampling/instance_sampling_stratified_example_wise.cpp index 0d9cafca9..24ab67893 100644 --- a/cpp/subprojects/common/src/mlrl/common/sampling/instance_sampling_stratified_example_wise.cpp +++ b/cpp/subprojects/common/src/mlrl/common/sampling/instance_sampling_stratified_example_wise.cpp @@ -19,6 +19,10 @@ class ExampleWiseStratifiedSampling final : public IInstanceSampling { const float32 sampleSize_; + const uint32 minSamples_; + + const uint32 maxSamples_; + BitWeightVector weightVector_; const ExampleWiseStratification stratification_; @@ -36,15 +40,19 @@ class ExampleWiseStratifiedSampling final : public IInstanceSampling { * training set * @param sampleSize The fraction of examples to be included in the sample (e.g. a value of 0.6 * corresponds to 60 % of the available examples). Must be in (0, 1] + * @param minSamples The minimum number of examples to be included in the sample. Must be at least 1 + * @param maxSamples The maximum number of examples to be included in the sample. 
Must be at least + * `minSamples` or 0, if the number of examples should not be restricted */ ExampleWiseStratifiedSampling(std::unique_ptr rngPtr, const LabelMatrix& labelMatrix, - IndexIterator indicesBegin, IndexIterator indicesEnd, float32 sampleSize) - : sampleSize_(sampleSize), + IndexIterator indicesBegin, IndexIterator indicesEnd, float32 sampleSize, + uint32 minSamples, uint32 maxSamples) + : sampleSize_(sampleSize), minSamples_(minSamples), maxSamples_(maxSamples), weightVector_(labelMatrix.numRows, static_cast(indicesEnd - indicesBegin) < labelMatrix.numRows), stratification_(std::move(rngPtr), labelMatrix, indicesBegin, indicesEnd) {} const IWeightVector& sample() override { - stratification_.sampleWeights(weightVector_, sampleSize_); + stratification_.sampleWeights(weightVector_, sampleSize_, minSamples_, maxSamples_); return weightVector_; } }; @@ -60,6 +68,10 @@ class ExampleWiseStratifiedInstanceSamplingFactory final : public IClassificatio const float32 sampleSize_; + const uint32 minSamples_; + + const uint32 maxSamples_; + public: /** @@ -67,41 +79,50 @@ class ExampleWiseStratifiedInstanceSamplingFactory final : public IClassificatio * generators * @param sampleSize The fraction of examples to be included in the sample (e.g. a value of 0.6 corresponds * to 60 % of the available examples). Must be in (0, 1] + * @param minSamples The minimum number of examples to be included in the sample. Must be at least 1 + * @param maxSamples The maximum number of examples to be included in the sample. 
Must be at least + * `minSamples` or 0, if the number of examples should not be restricted */ - ExampleWiseStratifiedInstanceSamplingFactory(std::unique_ptr rngFactoryPtr, float32 sampleSize) - : rngFactoryPtr_(std::move(rngFactoryPtr)), sampleSize_(sampleSize) {} + ExampleWiseStratifiedInstanceSamplingFactory(std::unique_ptr rngFactoryPtr, float32 sampleSize, + uint32 minSamples, uint32 maxSamples) + : rngFactoryPtr_(std::move(rngFactoryPtr)), sampleSize_(sampleSize), minSamples_(minSamples), + maxSamples_(maxSamples) {} std::unique_ptr create(const CContiguousView& labelMatrix, const SinglePartition& partition, IStatistics& statistics) const override { return std::make_unique< ExampleWiseStratifiedSampling, SinglePartition::const_iterator>>( - rngFactoryPtr_->create(), labelMatrix, partition.cbegin(), partition.cend(), sampleSize_); + rngFactoryPtr_->create(), labelMatrix, partition.cbegin(), partition.cend(), sampleSize_, minSamples_, + maxSamples_); } std::unique_ptr create(const CContiguousView& labelMatrix, BiPartition& partition, IStatistics& statistics) const override { return std::make_unique< ExampleWiseStratifiedSampling, BiPartition::const_iterator>>( - rngFactoryPtr_->create(), labelMatrix, partition.first_cbegin(), partition.first_cend(), sampleSize_); + rngFactoryPtr_->create(), labelMatrix, partition.first_cbegin(), partition.first_cend(), sampleSize_, + minSamples_, maxSamples_); } std::unique_ptr create(const BinaryCsrView& labelMatrix, const SinglePartition& partition, IStatistics& statistics) const override { return std::make_unique>( - rngFactoryPtr_->create(), labelMatrix, partition.cbegin(), partition.cend(), sampleSize_); + rngFactoryPtr_->create(), labelMatrix, partition.cbegin(), partition.cend(), sampleSize_, minSamples_, + maxSamples_); } std::unique_ptr create(const BinaryCsrView& labelMatrix, BiPartition& partition, IStatistics& statistics) const override { return std::make_unique>( - rngFactoryPtr_->create(), labelMatrix, 
partition.first_cbegin(), partition.first_cend(), sampleSize_); + rngFactoryPtr_->create(), labelMatrix, partition.first_cbegin(), partition.first_cend(), sampleSize_, + minSamples_, maxSamples_); } }; ExampleWiseStratifiedInstanceSamplingConfig::ExampleWiseStratifiedInstanceSamplingConfig( ReadableProperty rngConfig) - : rngConfig_(rngConfig), sampleSize_(0.66f) {} + : rngConfig_(rngConfig), sampleSize_(0.66f), minSamples_(1), maxSamples_(0) {} float32 ExampleWiseStratifiedInstanceSamplingConfig::getSampleSize() const { return sampleSize_; @@ -115,8 +136,30 @@ IExampleWiseStratifiedInstanceSamplingConfig& ExampleWiseStratifiedInstanceSampl return *this; } +uint32 ExampleWiseStratifiedInstanceSamplingConfig::getMinSamples() const { + return minSamples_; +} + +IExampleWiseStratifiedInstanceSamplingConfig& ExampleWiseStratifiedInstanceSamplingConfig::setMinSamples( + uint32 minSamples) { + util::assertGreaterOrEqual("minSamples", minSamples, 1); + minSamples_ = minSamples; + return *this; +} + +uint32 ExampleWiseStratifiedInstanceSamplingConfig::getMaxSamples() const { + return maxSamples_; +} + +IExampleWiseStratifiedInstanceSamplingConfig& ExampleWiseStratifiedInstanceSamplingConfig::setMaxSamples( + uint32 maxSamples) { + if (maxSamples != 0) util::assertGreaterOrEqual("maxSamples", maxSamples, minSamples_); + maxSamples_ = maxSamples; + return *this; +} + std::unique_ptr ExampleWiseStratifiedInstanceSamplingConfig::createClassificationInstanceSamplingFactory() const { return std::make_unique(rngConfig_.get().createRNGFactory(), - sampleSize_); + sampleSize_, minSamples_, maxSamples_); } diff --git a/cpp/subprojects/common/src/mlrl/common/sampling/instance_sampling_stratified_output_wise.cpp b/cpp/subprojects/common/src/mlrl/common/sampling/instance_sampling_stratified_output_wise.cpp index 4a675374c..8ae84de22 100644 --- a/cpp/subprojects/common/src/mlrl/common/sampling/instance_sampling_stratified_output_wise.cpp +++ 
b/cpp/subprojects/common/src/mlrl/common/sampling/instance_sampling_stratified_output_wise.cpp @@ -20,6 +20,10 @@ class OutputWiseStratifiedSampling final : public IInstanceSampling { const float32 sampleSize_; + const uint32 minSamples_; + + const uint32 maxSamples_; + BitWeightVector weightVector_; LabelWiseStratification stratification_; @@ -39,13 +43,14 @@ class OutputWiseStratifiedSampling final : public IInstanceSampling { * corresponds to 60 % of the available examples). Must be in (0, 1] */ OutputWiseStratifiedSampling(std::unique_ptr rngPtr, const LabelMatrix& labelMatrix, - IndexIterator indicesBegin, IndexIterator indicesEnd, float32 sampleSize) - : sampleSize_(sampleSize), + IndexIterator indicesBegin, IndexIterator indicesEnd, float32 sampleSize, + uint32 minSamples, uint32 maxSamples) + : sampleSize_(sampleSize), minSamples_(minSamples), maxSamples_(maxSamples), weightVector_(labelMatrix.numRows, static_cast(indicesEnd - indicesBegin) < labelMatrix.numRows), stratification_(std::move(rngPtr), labelMatrix, indicesBegin, indicesEnd) {} const IWeightVector& sample() override { - stratification_.sampleWeights(weightVector_, sampleSize_); + stratification_.sampleWeights(weightVector_, sampleSize_, minSamples_, maxSamples_); return weightVector_; } }; @@ -62,6 +67,10 @@ class OutputWiseStratifiedInstanceSamplingFactory final : public IClassification const float32 sampleSize_; + const uint32 minSamples_; + + const uint32 maxSamples_; + public: /** @@ -69,41 +78,50 @@ class OutputWiseStratifiedInstanceSamplingFactory final : public IClassification * generators * @param sampleSize The fraction of examples to be included in the sample (e.g. a value of 0.6 corresponds * to 60 % of the available examples). Must be in (0, 1] + * @param minSamples The minimum number of examples to be included in the sample. Must be at least 1 + * @param maxSamples The maximum number of examples to be included in the sample. 
Must be at least + * `minSamples` or 0, if the number of examples should not be restricted */ - OutputWiseStratifiedInstanceSamplingFactory(std::unique_ptr rngFactoryPtr, float32 sampleSize) - : rngFactoryPtr_(std::move(rngFactoryPtr)), sampleSize_(sampleSize) {} + OutputWiseStratifiedInstanceSamplingFactory(std::unique_ptr rngFactoryPtr, float32 sampleSize, + uint32 minSamples, uint32 maxSamples) + : rngFactoryPtr_(std::move(rngFactoryPtr)), sampleSize_(sampleSize), minSamples_(minSamples), + maxSamples_(maxSamples) {} std::unique_ptr create(const CContiguousView& labelMatrix, const SinglePartition& partition, IStatistics& statistics) const override { return std::make_unique< OutputWiseStratifiedSampling, SinglePartition::const_iterator>>( - rngFactoryPtr_->create(), labelMatrix, partition.cbegin(), partition.cend(), sampleSize_); + rngFactoryPtr_->create(), labelMatrix, partition.cbegin(), partition.cend(), sampleSize_, minSamples_, + maxSamples_); } std::unique_ptr create(const CContiguousView& labelMatrix, BiPartition& partition, IStatistics& statistics) const override { return std::make_unique< OutputWiseStratifiedSampling, BiPartition::const_iterator>>( - rngFactoryPtr_->create(), labelMatrix, partition.first_cbegin(), partition.first_cend(), sampleSize_); + rngFactoryPtr_->create(), labelMatrix, partition.first_cbegin(), partition.first_cend(), sampleSize_, + minSamples_, maxSamples_); } std::unique_ptr create(const BinaryCsrView& labelMatrix, const SinglePartition& partition, IStatistics& statistics) const override { return std::make_unique>( - rngFactoryPtr_->create(), labelMatrix, partition.cbegin(), partition.cend(), sampleSize_); + rngFactoryPtr_->create(), labelMatrix, partition.cbegin(), partition.cend(), sampleSize_, minSamples_, + maxSamples_); } std::unique_ptr create(const BinaryCsrView& labelMatrix, BiPartition& partition, IStatistics& statistics) const override { return std::make_unique>( - rngFactoryPtr_->create(), labelMatrix, 
partition.first_cbegin(), partition.first_cend(), sampleSize_); + rngFactoryPtr_->create(), labelMatrix, partition.first_cbegin(), partition.first_cend(), sampleSize_, + minSamples_, maxSamples_); } }; OutputWiseStratifiedInstanceSamplingConfig::OutputWiseStratifiedInstanceSamplingConfig( ReadableProperty rngConfig) - : rngConfig_(rngConfig), sampleSize_(0.66f) {} + : rngConfig_(rngConfig), sampleSize_(0.66f), minSamples_(1), maxSamples_(0) {} float32 OutputWiseStratifiedInstanceSamplingConfig::getSampleSize() const { return sampleSize_; @@ -117,8 +135,30 @@ IOutputWiseStratifiedInstanceSamplingConfig& OutputWiseStratifiedInstanceSamplin return *this; } +uint32 OutputWiseStratifiedInstanceSamplingConfig::getMinSamples() const { + return minSamples_; +} + +IOutputWiseStratifiedInstanceSamplingConfig& OutputWiseStratifiedInstanceSamplingConfig::setMinSamples( + uint32 minSamples) { + util::assertGreaterOrEqual("minSamples", minSamples, 1); + minSamples_ = minSamples; + return *this; +} + +uint32 OutputWiseStratifiedInstanceSamplingConfig::getMaxSamples() const { + return maxSamples_; +} + +IOutputWiseStratifiedInstanceSamplingConfig& OutputWiseStratifiedInstanceSamplingConfig::setMaxSamples( + uint32 maxSamples) { + if (maxSamples != 0) util::assertGreaterOrEqual("maxSamples", maxSamples, minSamples_); + maxSamples_ = maxSamples; + return *this; +} + std::unique_ptr OutputWiseStratifiedInstanceSamplingConfig::createClassificationInstanceSamplingFactory() const { return std::make_unique(rngConfig_.get().createRNGFactory(), - sampleSize_); + sampleSize_, minSamples_, maxSamples_); } diff --git a/cpp/subprojects/common/src/mlrl/common/sampling/instance_sampling_with_replacement.cpp b/cpp/subprojects/common/src/mlrl/common/sampling/instance_sampling_with_replacement.cpp index c84dbbf5a..3e1e71e2e 100644 --- a/cpp/subprojects/common/src/mlrl/common/sampling/instance_sampling_with_replacement.cpp +++ 
b/cpp/subprojects/common/src/mlrl/common/sampling/instance_sampling_with_replacement.cpp @@ -3,12 +3,13 @@ #include "mlrl/common/sampling/partition_bi.hpp" #include "mlrl/common/sampling/partition_single.hpp" #include "mlrl/common/sampling/weight_vector_dense.hpp" +#include "mlrl/common/util/math.hpp" #include "mlrl/common/util/validation.hpp" -static inline void sampleInternally(const SinglePartition& partition, float32 sampleSize, - DenseWeightVector& weightVector, RNG& rng) { +static inline void sampleInternally(const SinglePartition& partition, float32 sampleSize, uint32 minSamples, + uint32 maxSamples, DenseWeightVector& weightVector, RNG& rng) { uint32 numExamples = partition.getNumElements(); - uint32 numSamples = static_cast(sampleSize * numExamples); + uint32 numSamples = util::calculateBoundedFraction(numExamples, sampleSize, minSamples, maxSamples); typename DenseWeightVector::iterator weightIterator = weightVector.begin(); util::setViewToZeros(weightIterator, numExamples); uint32 numNonZeroWeights = 0; @@ -29,11 +30,11 @@ static inline void sampleInternally(const SinglePartition& partition, float32 sa weightVector.setNumNonZeroWeights(numNonZeroWeights); } -static inline void sampleInternally(BiPartition& partition, float32 sampleSize, DenseWeightVector& weightVector, - RNG& rng) { +static inline void sampleInternally(BiPartition& partition, float32 sampleSize, uint32 minSamples, uint32 maxSamples, + DenseWeightVector& weightVector, RNG& rng) { uint32 numExamples = partition.getNumElements(); uint32 numTrainingExamples = partition.getNumFirst(); - uint32 numSamples = static_cast(sampleSize * numTrainingExamples); + uint32 numSamples = util::calculateBoundedFraction(numTrainingExamples, sampleSize, minSamples, maxSamples); BiPartition::const_iterator indexIterator = partition.first_cbegin(); typename DenseWeightVector::iterator weightIterator = weightVector.begin(); util::setViewToZeros(weightIterator, numExamples); @@ -72,24 +73,32 @@ class 
InstanceSamplingWithReplacement final : public IInstanceSampling { const float32 sampleSize_; + const uint32 minSamples_; + + const uint32 maxSamples_; + DenseWeightVector weightVector_; public: /** - * @param rngPtr An unique pointer to an object of type `RNG` that should be used for generating random - * numbers - * @param partition A reference to an object of template type `Partition` that provides access to the indices - * of the examples that are included in the training set - * @param sampleSize The fraction of examples to be included in the sample (e.g. a value of 0.6 corresponds to - * 60 % of the available examples). Must be in (0, 1] + * @param rngPtr An unique pointer to an object of type `RNG` that should be used for generating random + * numbers + * @param partition A reference to an object of template type `Partition` that provides access to the + * indices of the examples that are included in the training set + * @param sampleSize The fraction of examples to be included in the sample (e.g. a value of 0.6 corresponds + * to 60 % of the available examples). Must be in (0, 1] + * @param minSamples The minimum number of examples to be included in the sample. Must be at least 1 + * @param maxSamples The maximum number of examples to be included in the sample. 
Must be at least + * `minSamples` or 0, if the number of examples should not be restricted */ - InstanceSamplingWithReplacement(std::unique_ptr rngPtr, Partition& partition, float32 sampleSize) - : rngPtr_(std::move(rngPtr)), partition_(partition), sampleSize_(sampleSize), - weightVector_(partition.getNumElements()) {} + InstanceSamplingWithReplacement(std::unique_ptr rngPtr, Partition& partition, float32 sampleSize, + uint32 minSamples, uint32 maxSamples) + : rngPtr_(std::move(rngPtr)), partition_(partition), sampleSize_(sampleSize), minSamples_(minSamples), + maxSamples_(maxSamples), weightVector_(partition.getNumElements()) {} const IWeightVector& sample() override { - sampleInternally(partition_, sampleSize_, weightVector_, *rngPtr_); + sampleInternally(partition_, sampleSize_, minSamples_, maxSamples_, weightVector_, *rngPtr_); return weightVector_; } }; @@ -106,6 +115,10 @@ class InstanceSamplingWithReplacementFactory final : public IClassificationInsta const float32 sampleSize_; + const uint32 minSamples_; + + const uint32 maxSamples_; + public: /** @@ -113,64 +126,69 @@ class InstanceSamplingWithReplacementFactory final : public IClassificationInsta * generators * @param sampleSize The fraction of examples to be included in the sample (e.g. a value of 0.6 corresponds * to 60 % of the available examples). Must be in (0, 1] + * @param minSamples The minimum number of examples to be included in the sample. Must be at least 1 + * @param maxSamples The maximum number of examples to be included in the sample. 
Must be at least + * `minSamples` or 0, if the number of examples should not be restricted */ - InstanceSamplingWithReplacementFactory(std::unique_ptr rngFactoryPtr, float32 sampleSize) - : rngFactoryPtr_(std::move(rngFactoryPtr)), sampleSize_(sampleSize) {} + InstanceSamplingWithReplacementFactory(std::unique_ptr rngFactoryPtr, float32 sampleSize, + uint32 minSamples, uint32 maxSamples) + : rngFactoryPtr_(std::move(rngFactoryPtr)), sampleSize_(sampleSize), minSamples_(minSamples), + maxSamples_(maxSamples) {} std::unique_ptr create(const CContiguousView& labelMatrix, const SinglePartition& partition, IStatistics& statistics) const override { - return std::make_unique>(rngFactoryPtr_->create(), - partition, sampleSize_); + return std::make_unique>( + rngFactoryPtr_->create(), partition, sampleSize_, minSamples_, maxSamples_); } std::unique_ptr create(const CContiguousView& labelMatrix, BiPartition& partition, IStatistics& statistics) const override { - return std::make_unique>(rngFactoryPtr_->create(), partition, - sampleSize_); + return std::make_unique>( + rngFactoryPtr_->create(), partition, sampleSize_, minSamples_, maxSamples_); } std::unique_ptr create(const BinaryCsrView& labelMatrix, const SinglePartition& partition, IStatistics& statistics) const override { - return std::make_unique>(rngFactoryPtr_->create(), - partition, sampleSize_); + return std::make_unique>( + rngFactoryPtr_->create(), partition, sampleSize_, minSamples_, maxSamples_); } std::unique_ptr create(const BinaryCsrView& labelMatrix, BiPartition& partition, IStatistics& statistics) const override { - return std::make_unique>(rngFactoryPtr_->create(), partition, - sampleSize_); + return std::make_unique>( + rngFactoryPtr_->create(), partition, sampleSize_, minSamples_, maxSamples_); } std::unique_ptr create(const CContiguousView& regressionMatrix, const SinglePartition& partition, IStatistics& statistics) const override { - return std::make_unique>(rngFactoryPtr_->create(), - partition, 
sampleSize_); + return std::make_unique>( + rngFactoryPtr_->create(), partition, sampleSize_, minSamples_, maxSamples_); } std::unique_ptr create(const CContiguousView& regressionMatrix, BiPartition& partition, IStatistics& statistics) const override { - return std::make_unique>(rngFactoryPtr_->create(), partition, - sampleSize_); + return std::make_unique>( + rngFactoryPtr_->create(), partition, sampleSize_, minSamples_, maxSamples_); } std::unique_ptr create(const CsrView& regressionMatrix, const SinglePartition& partition, IStatistics& statistics) const override { - return std::make_unique>(rngFactoryPtr_->create(), - partition, sampleSize_); + return std::make_unique>( + rngFactoryPtr_->create(), partition, sampleSize_, minSamples_, maxSamples_); } std::unique_ptr create(const CsrView& regressionMatrix, BiPartition& partition, IStatistics& statistics) const override { - return std::make_unique>(rngFactoryPtr_->create(), partition, - sampleSize_); + return std::make_unique>( + rngFactoryPtr_->create(), partition, sampleSize_, minSamples_, maxSamples_); } }; InstanceSamplingWithReplacementConfig::InstanceSamplingWithReplacementConfig(ReadableProperty rngConfig) - : rngConfig_(rngConfig), sampleSize_(0.66f) {} + : rngConfig_(rngConfig), sampleSize_(0.66f), minSamples_(1), maxSamples_(0) {} float32 InstanceSamplingWithReplacementConfig::getSampleSize() const { return sampleSize_; @@ -183,12 +201,34 @@ IInstanceSamplingWithReplacementConfig& InstanceSamplingWithReplacementConfig::s return *this; } +uint32 InstanceSamplingWithReplacementConfig::getMinSamples() const { + return minSamples_; +} + +IInstanceSamplingWithReplacementConfig& InstanceSamplingWithReplacementConfig::setMinSamples(uint32 minSamples) { + util::assertGreaterOrEqual("minSamples", minSamples, 1); + minSamples_ = minSamples; + return *this; +} + +uint32 InstanceSamplingWithReplacementConfig::getMaxSamples() const { + return maxSamples_; +} + +IInstanceSamplingWithReplacementConfig& 
InstanceSamplingWithReplacementConfig::setMaxSamples(uint32 maxSamples) { + if (maxSamples != 0) util::assertGreaterOrEqual("maxSamples", maxSamples, minSamples_); + maxSamples_ = maxSamples; + return *this; +} + std::unique_ptr InstanceSamplingWithReplacementConfig::createClassificationInstanceSamplingFactory() const { - return std::make_unique(rngConfig_.get().createRNGFactory(), sampleSize_); + return std::make_unique(rngConfig_.get().createRNGFactory(), sampleSize_, + minSamples_, maxSamples_); } std::unique_ptr InstanceSamplingWithReplacementConfig::createRegressionInstanceSamplingFactory() const { - return std::make_unique(rngConfig_.get().createRNGFactory(), sampleSize_); + return std::make_unique(rngConfig_.get().createRNGFactory(), sampleSize_, + minSamples_, maxSamples_); } diff --git a/cpp/subprojects/common/src/mlrl/common/sampling/instance_sampling_without_replacement.cpp b/cpp/subprojects/common/src/mlrl/common/sampling/instance_sampling_without_replacement.cpp index 725d80733..52fd45032 100644 --- a/cpp/subprojects/common/src/mlrl/common/sampling/instance_sampling_without_replacement.cpp +++ b/cpp/subprojects/common/src/mlrl/common/sampling/instance_sampling_without_replacement.cpp @@ -4,20 +4,21 @@ #include "mlrl/common/sampling/partition_bi.hpp" #include "mlrl/common/sampling/partition_single.hpp" #include "mlrl/common/sampling/weight_sampling.hpp" +#include "mlrl/common/util/math.hpp" #include "mlrl/common/util/validation.hpp" -static inline void sampleInternally(const SinglePartition& partition, float32 sampleSize, BitWeightVector& weightVector, - RNG& rng) { +static inline void sampleInternally(const SinglePartition& partition, float32 sampleSize, uint32 minSamples, + uint32 maxSamples, BitWeightVector& weightVector, RNG& rng) { uint32 numExamples = partition.getNumElements(); - uint32 numSamples = static_cast(sampleSize * numExamples); + uint32 numSamples = util::calculateBoundedFraction(numExamples, sampleSize, minSamples, maxSamples); 
sampleWeightsWithoutReplacement(weightVector, partition.cbegin(), numExamples, numSamples, rng); } -static inline void sampleInternally(BiPartition& partition, float32 sampleSize, BitWeightVector& weightVector, - RNG& rng) { +static inline void sampleInternally(BiPartition& partition, float32 sampleSize, uint32 minSamples, uint32 maxSamples, + BitWeightVector& weightVector, RNG& rng) { uint32 numTrainingExamples = partition.getNumFirst(); - uint32 numSamples = static_cast(sampleSize * numTrainingExamples); + uint32 numSamples = util::calculateBoundedFraction(numTrainingExamples, sampleSize, minSamples, maxSamples); sampleWeightsWithoutReplacement(weightVector, partition.first_cbegin(), numTrainingExamples, numSamples, rng); } @@ -38,6 +39,10 @@ class InstanceSamplingWithoutReplacement final : public IInstanceSampling { const float32 sampleSize_; + const uint32 minSamples_; + + const uint32 maxSamples_; + BitWeightVector weightVector_; public: @@ -49,13 +54,17 @@ class InstanceSamplingWithoutReplacement final : public IInstanceSampling { * indices of the examples that are included in the training set * @param sampleSize The fraction of examples to be included in the sample (e.g. a value of 0.6 corresponds * to 60 % of the available examples). Must be in (0, 1) + * @param minSamples The minimum number of examples to be included in the sample. Must be at least 1 + * @param maxSamples The maximum number of examples to be included in the sample. 
Must be at least + * `minSamples` or 0, if the number of examples should not be restricted */ - InstanceSamplingWithoutReplacement(std::unique_ptr rngPtr, Partition& partition, float32 sampleSize) - : rngPtr_(std::move(rngPtr)), partition_(partition), sampleSize_(sampleSize), - weightVector_(partition.getNumElements()) {} + InstanceSamplingWithoutReplacement(std::unique_ptr rngPtr, Partition& partition, float32 sampleSize, + uint32 minSamples, uint32 maxSamples) + : rngPtr_(std::move(rngPtr)), partition_(partition), sampleSize_(sampleSize), minSamples_(minSamples), + maxSamples_(maxSamples), weightVector_(partition.getNumElements()) {} const IWeightVector& sample() override { - sampleInternally(partition_, sampleSize_, weightVector_, *rngPtr_); + sampleInternally(partition_, sampleSize_, minSamples_, maxSamples_, weightVector_, *rngPtr_); return weightVector_; } }; @@ -72,6 +81,10 @@ class InstanceSamplingWithoutReplacementFactory final : public IClassificationIn const float32 sampleSize_; + const uint32 minSamples_; + + const uint32 maxSamples_; + public: /** @@ -79,65 +92,70 @@ class InstanceSamplingWithoutReplacementFactory final : public IClassificationIn * generators * @param sampleSize The fraction of examples to be included in the sample (e.g. a value of 0.6 corresponds * to 60 % of the available examples). Must be in (0, 1) + * @param minSamples The minimum number of examples to be included in the sample. Must be at least 1 + * @param maxSamples The maximum number of examples to be included in the sample. 
Must be at least + * `minSamples` or 0, if the number of examples should not be restricted */ - InstanceSamplingWithoutReplacementFactory(std::unique_ptr rngFactoryPtr, float32 sampleSize) - : rngFactoryPtr_(std::move(rngFactoryPtr)), sampleSize_(sampleSize) {} + InstanceSamplingWithoutReplacementFactory(std::unique_ptr rngFactoryPtr, float32 sampleSize, + uint32 minSamples, uint32 maxSamples) + : rngFactoryPtr_(std::move(rngFactoryPtr)), sampleSize_(sampleSize), minSamples_(minSamples), + maxSamples_(maxSamples) {} std::unique_ptr create(const CContiguousView& labelMatrix, const SinglePartition& partition, IStatistics& statistics) const override { - return std::make_unique>(rngFactoryPtr_->create(), - partition, sampleSize_); + return std::make_unique>( + rngFactoryPtr_->create(), partition, sampleSize_, minSamples_, maxSamples_); } std::unique_ptr create(const CContiguousView& labelMatrix, BiPartition& partition, IStatistics& statistics) const override { - return std::make_unique>(rngFactoryPtr_->create(), - partition, sampleSize_); + return std::make_unique>( + rngFactoryPtr_->create(), partition, sampleSize_, minSamples_, maxSamples_); } std::unique_ptr create(const BinaryCsrView& labelMatrix, const SinglePartition& partition, IStatistics& statistics) const override { - return std::make_unique>(rngFactoryPtr_->create(), - partition, sampleSize_); + return std::make_unique>( + rngFactoryPtr_->create(), partition, sampleSize_, minSamples_, maxSamples_); } std::unique_ptr create(const BinaryCsrView& labelMatrix, BiPartition& partition, IStatistics& statistics) const override { - return std::make_unique>(rngFactoryPtr_->create(), - partition, sampleSize_); + return std::make_unique>( + rngFactoryPtr_->create(), partition, sampleSize_, minSamples_, maxSamples_); } std::unique_ptr create(const CContiguousView& regressionMatrix, const SinglePartition& partition, IStatistics& statistics) const override { - return std::make_unique>(rngFactoryPtr_->create(), - partition, 
sampleSize_); + return std::make_unique>( + rngFactoryPtr_->create(), partition, sampleSize_, minSamples_, maxSamples_); } std::unique_ptr create(const CContiguousView& regressionMatrix, BiPartition& partition, IStatistics& statistics) const override { - return std::make_unique>(rngFactoryPtr_->create(), - partition, sampleSize_); + return std::make_unique>( + rngFactoryPtr_->create(), partition, sampleSize_, minSamples_, maxSamples_); } std::unique_ptr create(const CsrView& regressionMatrix, const SinglePartition& partition, IStatistics& statistics) const override { - return std::make_unique>(rngFactoryPtr_->create(), - partition, sampleSize_); + return std::make_unique>( + rngFactoryPtr_->create(), partition, sampleSize_, minSamples_, maxSamples_); } std::unique_ptr create(const CsrView& regressionMatrix, BiPartition& partition, IStatistics& statistics) const override { - return std::make_unique>(rngFactoryPtr_->create(), - partition, sampleSize_); + return std::make_unique>( + rngFactoryPtr_->create(), partition, sampleSize_, minSamples_, maxSamples_); } }; InstanceSamplingWithoutReplacementConfig::InstanceSamplingWithoutReplacementConfig( ReadableProperty rngConfig) - : rngConfig_(rngConfig), sampleSize_(0.66f) {} + : rngConfig_(rngConfig), sampleSize_(0.66f), minSamples_(1), maxSamples_(0) {} float32 InstanceSamplingWithoutReplacementConfig::getSampleSize() const { return sampleSize_; @@ -150,14 +168,34 @@ IInstanceSamplingWithoutReplacementConfig& InstanceSamplingWithoutReplacementCon return *this; } +uint32 InstanceSamplingWithoutReplacementConfig::getMinSamples() const { + return minSamples_; +} + +IInstanceSamplingWithoutReplacementConfig& InstanceSamplingWithoutReplacementConfig::setMinSamples(uint32 minSamples) { + util::assertGreaterOrEqual("minSamples", minSamples, 1); + minSamples_ = minSamples; + return *this; +} + +uint32 InstanceSamplingWithoutReplacementConfig::getMaxSamples() const { + return maxSamples_; +} + 
+IInstanceSamplingWithoutReplacementConfig& InstanceSamplingWithoutReplacementConfig::setMaxSamples(uint32 maxSamples) { + if (maxSamples != 0) util::assertGreaterOrEqual("maxSamples", maxSamples, minSamples_); + maxSamples_ = maxSamples; + return *this; +} + std::unique_ptr InstanceSamplingWithoutReplacementConfig::createClassificationInstanceSamplingFactory() const { - return std::make_unique(rngConfig_.get().createRNGFactory(), - sampleSize_); + return std::make_unique(rngConfig_.get().createRNGFactory(), sampleSize_, + minSamples_, maxSamples_); } std::unique_ptr InstanceSamplingWithoutReplacementConfig::createRegressionInstanceSamplingFactory() const { - return std::make_unique(rngConfig_.get().createRNGFactory(), - sampleSize_); + return std::make_unique(rngConfig_.get().createRNGFactory(), sampleSize_, + minSamples_, maxSamples_); } diff --git a/cpp/subprojects/common/src/mlrl/common/sampling/stratified_sampling_example_wise.cpp b/cpp/subprojects/common/src/mlrl/common/sampling/stratified_sampling_example_wise.cpp index 18b78a3ca..d5fb5d377 100644 --- a/cpp/subprojects/common/src/mlrl/common/sampling/stratified_sampling_example_wise.cpp +++ b/cpp/subprojects/common/src/mlrl/common/sampling/stratified_sampling_example_wise.cpp @@ -1,6 +1,7 @@ #include "mlrl/common/sampling/stratified_sampling_example_wise.hpp" #include "mlrl/common/sampling/partition_single.hpp" +#include "mlrl/common/util/math.hpp" #include "stratified_sampling_common.hpp" #include @@ -46,8 +47,9 @@ ExampleWiseStratification::ExampleWiseStratification template void ExampleWiseStratification::sampleWeights(BitWeightVector& weightVector, - float32 sampleSize) const { - uint32 numTotalSamples = static_cast(std::round(sampleSize * numTotal_)); + float32 sampleSize, uint32 minSamples, + uint32 maxSamples) const { + uint32 numTotalSamples = util::calculateBoundedFraction(numTotal_, sampleSize, minSamples, maxSamples); uint32 numTotalOutOfSamples = numTotal_ - numTotalSamples; uint32 
numNonZeroWeights = 0; uint32 numZeroWeights = 0; diff --git a/cpp/subprojects/common/src/mlrl/common/sampling/stratified_sampling_output_wise.cpp b/cpp/subprojects/common/src/mlrl/common/sampling/stratified_sampling_output_wise.cpp index fc487ae9e..446b272f9 100644 --- a/cpp/subprojects/common/src/mlrl/common/sampling/stratified_sampling_output_wise.cpp +++ b/cpp/subprojects/common/src/mlrl/common/sampling/stratified_sampling_output_wise.cpp @@ -3,6 +3,7 @@ #include "mlrl/common/data/array.hpp" #include "mlrl/common/data/indexed_value.hpp" #include "mlrl/common/sampling/partition_single.hpp" +#include "mlrl/common/util/math.hpp" #include "stratified_sampling_common.hpp" #include @@ -376,10 +377,11 @@ LabelWiseStratification::LabelWiseStratification(std template void LabelWiseStratification::sampleWeights(BitWeightVector& weightVector, - float32 sampleSize) { + float32 sampleSize, uint32 minSamples, + uint32 maxSamples) { uint32 numRows = stratificationMatrix_.getNumRows(); uint32 numCols = stratificationMatrix_.getNumCols(); - uint32 numTotalSamples = static_cast(std::round(sampleSize * numRows)); + uint32 numTotalSamples = util::calculateBoundedFraction(numRows, sampleSize, minSamples, maxSamples); uint32 numTotalOutOfSamples = numRows - numTotalSamples; uint32 numNonZeroWeights = 0; uint32 numZeroWeights = 0; diff --git a/doc/user_guide/boosting/parameters.md b/doc/user_guide/boosting/parameters.md index b626be2c6..c1ccbcc8b 100644 --- a/doc/user_guide/boosting/parameters.md +++ b/doc/user_guide/boosting/parameters.md @@ -427,6 +427,14 @@ The seed to be used by random number generators. The given value must be at leas The percentage of features to be included in a sample. For example, a value of 0.6 corresponds to 60% of the features. The given value must be in (0, 1\] or 0, if the sample size should be calculated as log2(A - 1) + 1), where A denotes the number of available features. 
+ - `min_samples` *(Default value = `1`)* + + The minimum number of features to be included in a sample. The given value must be at least 1. + + - `max_samples` *(Default value = `0`)* + + The maximum number of features to be included in a sample. The given value must be at least the value of `min_samples` or 0, if the number of features should not be restricted. + - `num_retained` *(Default value = `0`)* The number of trailing features to be always included in a sample. For example, a value of 2 means that the last two features are always retained. @@ -449,6 +457,14 @@ The seed to be used by random number generators. The given value must be at leas The percentage of examples to be included in a sample. For example, a value of 0.6 corresponds to 60% of the available examples. The given value must be in the range (0, 1). + - `min_samples` *(Default value = `1`)* + + The minimum number of examples to be included in a sample. The given value must be at least 1. + + - `max_samples` *(Default value = `0`)* + + The maximum number of examples to be included in a sample. The given value must be at least the value of `min_samples` or 0, if the number of examples should not be restricted. + `'without-replacement'` The training examples to be considered for learning a new rule are selected randomly without replacement. The following options may be provided using the {ref}`bracket notation`: @@ -456,6 +472,14 @@ The seed to be used by random number generators. The given value must be at leas The percentage of examples to be included in a sample. For example, a value of 0.6 corresponds to 60% of the available examples. The given value must be in the range (0, 1). + - `min_samples` *(Default value = `1`)* + + The minimum number of examples to be included in a sample. The given value must be at least 1. + + - `max_samples` *(Default value = `0`)* + + The maximum number of examples to be included in a sample. 
The given value must be at least the value of `min_samples` or 0, if the number of examples should not be restricted. + `'stratified-output-wise'` *(classification only)* The training examples to be considered for learning a new rule are selected according to an iterative stratified sampling method that ensures that for each label the proportion of relevant and irrelevant examples is maintained. The following options may be provided using the {ref}`bracket notation`: @@ -463,12 +487,28 @@ The seed to be used by random number generators. The given value must be at leas The percentage of examples to be included in a sample. For example, a value of 0.6 corresponds to 60% of the available examples. The given value must be in the range (0, 1). + - `min_samples` *(Default value = `1`)* + + The minimum number of examples to be included in a sample. The given value must be at least 1. + + - `max_samples` *(Default value = `0`)* + + The maximum number of examples to be included in a sample. The given value must be at least the value of `min_samples` or 0, if the number of examples should not be restricted. + `'stratified-example-wise'` (*classification only*) The training examples to be considered for learning a new rule are selected according to stratified sampling method, where distinct label vectors are treated as individual classes. The following options may be provided using the {ref}`bracket notation`: - `sample_size` *(Default value = `0.66`)* The percentage of examples to be included in a sample. For example, a value of 0.6 corresponds to 60% of the available examples. The given value must be in the range (0, 1). + + - `min_samples` *(Default value = `1`)* + + The minimum number of examples to be included in a sample. The given value must be at least 1. + + - `max_samples` *(Default value = `0`)* + + The maximum number of examples to be included in a sample. 
The given value must be at least the value of `min_samples` or 0, if the number of examples should not be restricted. ``` ## Approximations and Optimizations diff --git a/doc/user_guide/seco/parameters.md b/doc/user_guide/seco/parameters.md index 7f2cd1aea..5791f21b3 100644 --- a/doc/user_guide/seco/parameters.md +++ b/doc/user_guide/seco/parameters.md @@ -357,6 +357,14 @@ The seed to be used by random number generators. The given value must be at leas The percentage of features to be included in a sample. For example, a value of 0.6 corresponds to 60% of the features. The given value must be in (0, 1\] or 0, if the sample size should be calculated as log2(A - 1) + 1), where A denotes the number of available features. + - `min_samples` *(Default value = `1`)* + + The minimum number of features to be included in a sample. The given value must be at least 1. + + - `max_samples` *(Default value = `0`)* + + The maximum number of features to be included in a sample. The given value must be at least the value of `min_samples` or 0, if the number of features should not be restricted. + - `num_retained` *(Default value = `0`)* The number of trailing features to be always included in a sample. For example, a value of 2 means that the last two features are always retained. @@ -379,6 +387,14 @@ The seed to be used by random number generators. The given value must be at leas The percentage of examples to be included in a sample. For example, a value of 0.6 corresponds to 60% of the available examples. The given value must be in the range (0, 1). + - `min_samples` *(Default value = `1`)* + + The minimum number of examples to be included in a sample. The given value must be at least 1. + + - `max_samples` *(Default value = `0`)* + + The maximum number of examples to be included in a sample. The given value must be at least the value of `min_samples` or 0, if the number of examples should not be restricted. 
+ `'without-replacement'` The training examples to be considered for learning a new rule are selected randomly without replacement. The following options may be provided using the {ref}`bracket notation`: @@ -386,6 +402,14 @@ The seed to be used by random number generators. The given value must be at leas The percentage of examples to be included in a sample. For example, a value of 0.6 corresponds to 60% of the available examples. The given value must be in the range (0, 1). + - `min_samples` *(Default value = `1`)* + + The minimum number of examples to be included in a sample. The given value must be at least 1. + + - `max_samples` *(Default value = `0`)* + + The maximum number of examples to be included in a sample. The given value must be at least the value of `min_samples` or 0, if the number of examples should not be restricted. + `'stratified-output-wise'` The training examples to be considered for learning a new rule are selected according to an iterative stratified sampling method that ensures that for each label the proportion of relevant and irrelevant examples is maintained. The following options may be provided using the {ref}`bracket notation`: @@ -393,12 +417,28 @@ The seed to be used by random number generators. The given value must be at leas The percentage of examples to be included in a sample. For example, a value of 0.6 corresponds to 60% of the available examples. The given value must be in the range (0, 1). + - `min_samples` *(Default value = `1`)* + + The minimum number of examples to be included in a sample. The given value must be at least 1. + + - `max_samples` *(Default value = `0`)* + + The maximum number of examples to be included in a sample. The given value must be at least the value of `min_samples` or 0, if the number of examples should not be restricted. 
+ `'stratified-example-wise'` The training examples to be considered for learning a new rule are selected according to stratified sampling method, where distinct label vectors are treated as individual classes. The following options may be provided using the {ref}`bracket notation`: - `sample_size` *(Default value = `0.66`)* The percentage of examples to be included in a sample. For example, a value of 0.6 corresponds to 60% of the available examples. The given value must be in the range (0, 1). + + - `min_samples` *(Default value = `1`)* + + The minimum number of examples to be included in a sample. The given value must be at least 1. + + - `max_samples` *(Default value = `0`)* + + The maximum number of examples to be included in a sample. The given value must be at least the value of `min_samples` or 0, if the number of examples should not be restricted. ``` ## Approximations and Optimizations diff --git a/python/subprojects/common/mlrl/common/config.py b/python/subprojects/common/mlrl/common/config.py index af9581d86..9f47f8438 100644 --- a/python/subprojects/common/mlrl/common/config.py +++ b/python/subprojects/common/mlrl/common/config.py @@ -42,6 +42,10 @@ OPTION_SAMPLE_SIZE = 'sample_size' +OPTION_MIN_SAMPLES = 'min_samples' + +OPTION_MAX_SAMPLES = 'max_samples' + OPTION_NUM_SAMPLES = 'num_samples' BINNING_EQUAL_FREQUENCY = 'equal-frequency' @@ -418,16 +422,16 @@ def __init__(self): self.add_value(name=NONE, mixin=NoInstanceSamplingMixin) self.add_value(name=SAMPLING_WITH_REPLACEMENT, mixin=InstanceSamplingWithReplacementMixin, - options={OPTION_SAMPLE_SIZE}) + options={OPTION_SAMPLE_SIZE, OPTION_MIN_SAMPLES, OPTION_MAX_SAMPLES}) self.add_value(name=SAMPLING_WITHOUT_REPLACEMENT, mixin=InstanceSamplingWithoutReplacementMixin, - options={OPTION_SAMPLE_SIZE}) + options={OPTION_SAMPLE_SIZE, OPTION_MIN_SAMPLES, OPTION_MAX_SAMPLES}) self.add_value(name=SAMPLING_STRATIFIED_OUTPUT_WISE, mixin=OutputWiseStratifiedInstanceSamplingMixin, - options={OPTION_SAMPLE_SIZE}) 
+ options={OPTION_SAMPLE_SIZE, OPTION_MIN_SAMPLES, OPTION_MAX_SAMPLES}) self.add_value(name=SAMPLING_STRATIFIED_EXAMPLE_WISE, mixin=ExampleWiseStratifiedInstanceSamplingMixin, - options={OPTION_SAMPLE_SIZE}) + options={OPTION_SAMPLE_SIZE, OPTION_MIN_SAMPLES, OPTION_MAX_SAMPLES}) def _configure(self, config, value: str, options: Optional[Options]): if value == NONE: @@ -435,15 +439,23 @@ def _configure(self, config, value: str, options: Optional[Options]): elif value == SAMPLING_WITH_REPLACEMENT: conf = config.use_instance_sampling_with_replacement() conf.set_sample_size(options.get_float(OPTION_SAMPLE_SIZE, conf.get_sample_size())) + conf.set_min_samples(options.get_int(OPTION_MIN_SAMPLES, conf.get_min_samples())) + conf.set_max_samples(options.get_int(OPTION_MAX_SAMPLES, conf.get_max_samples())) elif value == SAMPLING_WITHOUT_REPLACEMENT: conf = config.use_instance_sampling_without_replacement() conf.set_sample_size(options.get_float(OPTION_SAMPLE_SIZE, conf.get_sample_size())) + conf.set_min_samples(options.get_int(OPTION_MIN_SAMPLES, conf.get_min_samples())) + conf.set_max_samples(options.get_int(OPTION_MAX_SAMPLES, conf.get_max_samples())) elif value == SAMPLING_STRATIFIED_OUTPUT_WISE: conf = config.use_output_wise_stratified_instance_sampling() conf.set_sample_size(options.get_float(OPTION_SAMPLE_SIZE, conf.get_sample_size())) + conf.set_min_samples(options.get_int(OPTION_MIN_SAMPLES, conf.get_min_samples())) + conf.set_max_samples(options.get_int(OPTION_MAX_SAMPLES, conf.get_max_samples())) elif value == SAMPLING_STRATIFIED_EXAMPLE_WISE: conf = config.use_example_wise_stratified_instance_sampling() conf.set_sample_size(options.get_float(OPTION_SAMPLE_SIZE, conf.get_sample_size())) + conf.set_min_samples(options.get_int(OPTION_MIN_SAMPLES, conf.get_min_samples())) + conf.set_max_samples(options.get_int(OPTION_MAX_SAMPLES, conf.get_max_samples())) class FeatureSamplingParameter(NominalParameter): @@ -459,7 +471,7 @@ def __init__(self): self.add_value(name=NONE, 
mixin=NoFeatureSamplingMixin) self.add_value(name=SAMPLING_WITHOUT_REPLACEMENT, mixin=FeatureSamplingWithoutReplacementMixin, - options={OPTION_SAMPLE_SIZE, self.OPTION_NUM_RETAINED}) + options={OPTION_SAMPLE_SIZE, OPTION_MIN_SAMPLES, OPTION_MAX_SAMPLES, self.OPTION_NUM_RETAINED}) def _configure(self, config, value: str, options: Optional[Options]): if value == NONE: @@ -467,6 +479,8 @@ def _configure(self, config, value: str, options: Optional[Options]): elif value == SAMPLING_WITHOUT_REPLACEMENT: conf = config.use_feature_sampling_without_replacement() conf.set_sample_size(options.get_float(OPTION_SAMPLE_SIZE, conf.get_sample_size())) + conf.set_min_samples(options.get_int(OPTION_MIN_SAMPLES, conf.get_min_samples())) + conf.set_max_samples(options.get_int(OPTION_MAX_SAMPLES, conf.get_max_samples())) conf.set_num_retained(options.get_int(self.OPTION_NUM_RETAINED, conf.get_num_retained())) diff --git a/python/subprojects/common/mlrl/common/cython/feature_sampling.pxd b/python/subprojects/common/mlrl/common/cython/feature_sampling.pxd index 1d57ef1b9..043242f69 100644 --- a/python/subprojects/common/mlrl/common/cython/feature_sampling.pxd +++ b/python/subprojects/common/mlrl/common/cython/feature_sampling.pxd @@ -11,6 +11,14 @@ cdef extern from "mlrl/common/sampling/feature_sampling_without_replacement.hpp" IFeatureSamplingWithoutReplacementConfig& setSampleSize(float32 sampleSize) except + + uint32 getMinSamples() const + + IFeatureSamplingWithoutReplacementConfig& setMinSamples(uint32 minSamples) except + + + uint32 getMaxSamples() const + + IFeatureSamplingWithoutReplacementConfig& setMaxSamples(uint32 maxSamples) except + + uint32 getNumRetained() const IFeatureSamplingWithoutReplacementConfig& setNumRetained(uint32 numRetained) except + diff --git a/python/subprojects/common/mlrl/common/cython/feature_sampling.pyx b/python/subprojects/common/mlrl/common/cython/feature_sampling.pyx index 576518b54..b8a2501e2 100644 --- 
a/python/subprojects/common/mlrl/common/cython/feature_sampling.pyx +++ b/python/subprojects/common/mlrl/common/cython/feature_sampling.pyx @@ -32,6 +32,48 @@ cdef class FeatureSamplingWithoutReplacementConfig: self.config_ptr.setSampleSize(sample_size) return self + def get_min_samples(self) -> int: + """ + Returns the minimum number of features that are included in a sample. + + :return: The minimum number of features that are included in a sample + """ + return self.config_ptr.getMinSamples() + + def set_min_samples(self, min_samples: int) -> FeatureSamplingWithoutReplacementConfig: + """ + Sets the minimum number of features that should be included in a sample. + + :param min_samples: The minimum number of features that should be included in a sample. Must be at least 1 + :return: A `FeatureSamplingWithoutReplacementConfig` that allows further configuration of the method + for sampling features + """ + assert_greater_or_equal('min_samples', min_samples, 1) + self.config_ptr.setMinSamples(min_samples) + return self + + def get_max_samples(self) -> int: + """ + Returns the maximum number of features that are included in a sample. + + :return: The maximum number of features that are included in a sample + """ + return self.config_ptr.getMaxSamples() + + def set_max_samples(self, max_samples: int) -> FeatureSamplingWithoutReplacementConfig: + """ + Sets the maximum number of features that should be included in a sample. + + :param max_samples: The maximum number of features that should be included in a sample. 
Must be at least + `get_min_samples()` or 0, if the number of features should not be restricted + :return: A `FeatureSamplingWithoutReplacementConfig` that allows further configuration of the method + for sampling features + """ + if max_samples != 0: + assert_greater_or_equal('max_samples', max_samples, self.get_min_samples()) + self.config_ptr.setMaxSamples(max_samples) + return self + def get_num_retained(self) -> int: """ Returns the number of trailing features that are always included in a sample. diff --git a/python/subprojects/common/mlrl/common/cython/instance_sampling.pxd b/python/subprojects/common/mlrl/common/cython/instance_sampling.pxd index 14c1a2820..bc50e8167 100644 --- a/python/subprojects/common/mlrl/common/cython/instance_sampling.pxd +++ b/python/subprojects/common/mlrl/common/cython/instance_sampling.pxd @@ -1,4 +1,4 @@ -from mlrl.common.cython._types cimport float32 +from mlrl.common.cython._types cimport float32, uint32 cdef extern from "mlrl/common/sampling/instance_sampling_stratified_example_wise.hpp" nogil: @@ -11,6 +11,14 @@ cdef extern from "mlrl/common/sampling/instance_sampling_stratified_example_wise IExampleWiseStratifiedInstanceSamplingConfig& setSampleSize(float32 sampleSize) + uint32 getMinSamples() const + + IExampleWiseStratifiedInstanceSamplingConfig& setMinSamples(uint32 minSamples) + + uint32 getMaxSamples() const + + IExampleWiseStratifiedInstanceSamplingConfig& setMaxSamples(uint32 maxSamples) + cdef extern from "mlrl/common/sampling/instance_sampling_stratified_output_wise.hpp" nogil: @@ -22,6 +30,14 @@ cdef extern from "mlrl/common/sampling/instance_sampling_stratified_output_wise. 
IOutputWiseStratifiedInstanceSamplingConfig& setSampleSize(float32 sampleSize) + uint32 getMinSamples() const + + IOutputWiseStratifiedInstanceSamplingConfig& setMinSamples(uint32 minSamples) + + uint32 getMaxSamples() const + + IOutputWiseStratifiedInstanceSamplingConfig& setMaxSamples(uint32 maxSamples) + cdef extern from "mlrl/common/sampling/instance_sampling_with_replacement.hpp" nogil: @@ -33,6 +49,14 @@ cdef extern from "mlrl/common/sampling/instance_sampling_with_replacement.hpp" n IInstanceSamplingWithReplacementConfig& setSampleSize(float32 sampleSize) + uint32 getMinSamples() const + + IInstanceSamplingWithReplacementConfig& setMinSamples(uint32 minSamples) + + uint32 getMaxSamples() const + + IInstanceSamplingWithReplacementConfig& setMaxSamples(uint32 maxSamples) + cdef extern from "mlrl/common/sampling/instance_sampling_without_replacement.hpp" nogil: @@ -44,6 +68,14 @@ cdef extern from "mlrl/common/sampling/instance_sampling_without_replacement.hpp IInstanceSamplingWithoutReplacementConfig& setSampleSize(float32 sampleSize) + uint32 getMinSamples() const + + IInstanceSamplingWithoutReplacementConfig& setMinSamples(uint32 minSamples) + + uint32 getMaxSamples() const + + IInstanceSamplingWithoutReplacementConfig& setMaxSamples(uint32 maxSamples) + cdef class ExampleWiseStratifiedInstanceSamplingConfig: diff --git a/python/subprojects/common/mlrl/common/cython/instance_sampling.pyx b/python/subprojects/common/mlrl/common/cython/instance_sampling.pyx index 3085b4366..b7c0ae1b0 100644 --- a/python/subprojects/common/mlrl/common/cython/instance_sampling.pyx +++ b/python/subprojects/common/mlrl/common/cython/instance_sampling.pyx @@ -1,7 +1,7 @@ """ @author: Michael Rapp (michael.rapp.ml@gmail.com) """ -from mlrl.common.cython.validation import assert_greater, assert_less +from mlrl.common.cython.validation import assert_greater, assert_greater_or_equal, assert_less cdef class ExampleWiseStratifiedInstanceSamplingConfig: @@ -32,6 
+32,48 @@ cdef class ExampleWiseStratifiedInstanceSamplingConfig: self.config_ptr.setSampleSize(sample_size) return self + def get_min_samples(self) -> int: + """ + Returns the minimum number of examples that are included in a sample. + + :return: The minimum number of examples that are included in a sample + """ + return self.config_ptr.getMinSamples() + + def set_min_samples(self, min_samples: int) -> ExampleWiseStratifiedInstanceSamplingConfig: + """ + Sets the minimum number of examples that should be included in a sample. + + :param min_samples: The minimum number of examples that should be included in a sample. Must be at least 1 + :return: An `ExampleWiseStratifiedInstanceSamplingConfig` that allows further configuration of the + method for sampling instances + """ + assert_greater_or_equal('min_samples', min_samples, 1) + self.config_ptr.setMinSamples(min_samples) + return self + + def get_max_samples(self) -> int: + """ + Returns the maximum number of examples that are included in a sample. + + :return: The maximum number of examples that are included in a sample + """ + return self.config_ptr.getMaxSamples() + + def set_max_samples(self, max_samples: int) -> ExampleWiseStratifiedInstanceSamplingConfig: + """ + Sets the maximum number of examples that should be included in a sample. + + :param max_samples: The maximum number of examples that should be included in a sample. 
Must be at least + `get_min_samples()` or 0, if the number of examples should not be restricted + :return: An `ExampleWiseStratifiedInstanceSamplingConfig` that allows further configuration of the + method for sampling instances + """ + if max_samples != 0: + assert_greater_or_equal('max_samples', max_samples, self.get_min_samples()) + self.config_ptr.setMaxSamples(max_samples) + return self + cdef class OutputWiseStratifiedInstanceSamplingConfig: """ @@ -61,6 +103,48 @@ cdef class OutputWiseStratifiedInstanceSamplingConfig: self.config_ptr.setSampleSize(sample_size) return self + def get_min_samples(self) -> int: + """ + Returns the minimum number of examples that are included in a sample. + + :return: The minimum number of examples that are included in a sample + """ + return self.config_ptr.getMinSamples() + + def set_min_samples(self, min_samples: int) -> OutputWiseStratifiedInstanceSamplingConfig: + """ + Sets the minimum number of examples that should be included in a sample. + + :param min_samples: The minimum number of examples that should be included in a sample. Must be at least 1 + :return: An `OutputWiseStratifiedInstanceSamplingConfig` that allows further configuration of the + method for sampling instances + """ + assert_greater_or_equal('min_samples', min_samples, 1) + self.config_ptr.setMinSamples(min_samples) + return self + + def get_max_samples(self) -> int: + """ + Returns the maximum number of examples that are included in a sample. + + :return: The maximum number of examples that are included in a sample + """ + return self.config_ptr.getMaxSamples() + + def set_max_samples(self, max_samples: int) -> OutputWiseStratifiedInstanceSamplingConfig: + """ + Sets the maximum number of examples that should be included in a sample. + + :param max_samples: The maximum number of examples that should be included in a sample. 
Must be at least + `get_min_samples()` or 0, if the number of examples should not be restricted + :return: An `OutputWiseStratifiedInstanceSamplingConfig` that allows further configuration of the + method for sampling instances + """ + if max_samples != 0: + assert_greater_or_equal('max_samples', max_samples, self.get_min_samples()) + self.config_ptr.setMaxSamples(max_samples) + return self + cdef class InstanceSamplingWithReplacementConfig: """ @@ -89,6 +173,48 @@ cdef class InstanceSamplingWithReplacementConfig: self.config_ptr.setSampleSize(sample_size) return self + def get_min_samples(self) -> int: + """ + Returns the minimum number of examples that are included in a sample. + + :return: The minimum number of examples that are included in a sample + """ + return self.config_ptr.getMinSamples() + + def set_min_samples(self, min_samples: int) -> InstanceSamplingWithReplacementConfig: + """ + Sets the minimum number of examples that should be included in a sample. + + :param min_samples: The minimum number of examples that should be included in a sample. Must be at least 1 + :return: An `InstanceSamplingWithReplacementConfig` that allows further configuration of the method + for sampling instances + """ + assert_greater_or_equal('min_samples', min_samples, 1) + self.config_ptr.setMinSamples(min_samples) + return self + + def get_max_samples(self) -> int: + """ + Returns the maximum number of examples that are included in a sample. + + :return: The maximum number of examples that are included in a sample + """ + return self.config_ptr.getMaxSamples() + + def set_max_samples(self, max_samples: int) -> InstanceSamplingWithReplacementConfig: + """ + Sets the maximum number of examples that should be included in a sample. + + :param max_samples: The maximum number of examples that should be included in a sample. 
Must be at least + `get_min_samples()` or 0, if the number of examples should not be restricted + :return: An `InstanceSamplingWithReplacementConfig` that allows further configuration of the method + for sampling instances + """ + if max_samples != 0: + assert_greater_or_equal('max_samples', max_samples, self.get_min_samples()) + self.config_ptr.setMaxSamples(max_samples) + return self + cdef class InstanceSamplingWithoutReplacementConfig: """ @@ -116,3 +242,45 @@ cdef class InstanceSamplingWithoutReplacementConfig: assert_less('sample_size', sample_size, 1) self.config_ptr.setSampleSize(sample_size) return self + + def get_min_samples(self) -> int: + """ + Returns the minimum number of examples that are included in a sample. + + :return: The minimum number of examples that are included in a sample + """ + return self.config_ptr.getMinSamples() + + def set_min_samples(self, min_samples: int) -> InstanceSamplingWithoutReplacementConfig: + """ + Sets the minimum number of examples that should be included in a sample. + + :param min_samples: The minimum number of examples that should be included in a sample. Must be at least 1 + :return: An `InstanceSamplingWithoutReplacementConfig` that allows further configuration of the method + for sampling instances + """ + assert_greater_or_equal('min_samples', min_samples, 1) + self.config_ptr.setMinSamples(min_samples) + return self + + def get_max_samples(self) -> int: + """ + Returns the maximum number of examples that are included in a sample. + + :return: The maximum number of examples that are included in a sample + """ + return self.config_ptr.getMaxSamples() + + def set_max_samples(self, max_samples: int) -> InstanceSamplingWithoutReplacementConfig: + """ + Sets the maximum number of examples that should be included in a sample. + + :param max_samples: The maximum number of examples that should be included in a sample. 
Must be at least + `get_min_samples()` or 0, if the number of examples should not be restricted + :return: An `InstanceSamplingWithoutReplacementConfig` that allows further configuration of the method + for sampling instances + """ + if max_samples != 0: + assert_greater_or_equal('max_samples', max_samples, self.get_min_samples()) + self.config_ptr.setMaxSamples(max_samples) + return self diff --git a/python/subprojects/testbed/tests/res/out/boomer-classifier/instance-sampling-with-replacement.txt b/python/subprojects/testbed/tests/res/out/boomer-classifier/instance-sampling-with-replacement.txt index 793f74775..68b7de2ca 100644 --- a/python/subprojects/testbed/tests/res/out/boomer-classifier/instance-sampling-with-replacement.txt +++ b/python/subprojects/testbed/tests/res/out/boomer-classifier/instance-sampling-with-replacement.txt @@ -12,21 +12,21 @@ DEBUG A dense matrix is used to store the predicted labels INFO Successfully predicted in INFO Evaluation result for test data: -Example-wise F1 60.43 -Example-wise Jaccard 53.15 -Example-wise Precision 78.91 -Example-wise Recall 59.95 -Hamming Accuracy 81.46 -Hamming Loss 18.54 -Macro F1 65.75 -Macro Jaccard 49.93 -Macro Precision 77.16 -Macro Recall 59.52 -Micro F1 67.56 -Micro Jaccard 51.01 -Micro Precision 76.43 -Micro Recall 60.53 -Subset 0/1 Loss 70.41 -Subset Accuracy 29.59 +Example-wise F1 60.71 +Example-wise Jaccard 52.98 +Example-wise Precision 77.3 +Example-wise Recall 61.22 +Hamming Accuracy 80.87 +Hamming Loss 19.13 +Macro F1 64.82 +Macro Jaccard 49.09 +Macro Precision 73.78 +Macro Recall 59.67 +Micro F1 67.06 +Micro Jaccard 50.44 +Micro Precision 74.35 +Micro Recall 61.07 +Subset 0/1 Loss 71.94 +Subset Accuracy 28.06 INFO Successfully finished after diff --git a/python/subprojects/testbed/tests/res/out/boomer-classifier/instance-sampling-without-replacement.txt b/python/subprojects/testbed/tests/res/out/boomer-classifier/instance-sampling-without-replacement.txt index 9fef912e3..0e5d9d4b9 100644 --- 
a/python/subprojects/testbed/tests/res/out/boomer-classifier/instance-sampling-without-replacement.txt +++ b/python/subprojects/testbed/tests/res/out/boomer-classifier/instance-sampling-without-replacement.txt @@ -12,21 +12,21 @@ DEBUG A dense matrix is used to store the predicted labels INFO Successfully predicted in INFO Evaluation result for test data: -Example-wise F1 63.28 -Example-wise Jaccard 55.4 -Example-wise Precision 79.42 -Example-wise Recall 63.61 -Hamming Accuracy 82.4 -Hamming Loss 17.6 -Macro F1 67.46 -Macro Jaccard 52.03 -Macro Precision 77.09 -Macro Recall 62.01 -Micro F1 69.78 -Micro Jaccard 53.59 -Micro Precision 77.1 -Micro Recall 63.73 -Subset 0/1 Loss 70.41 -Subset Accuracy 29.59 +Example-wise F1 61.82 +Example-wise Jaccard 54.42 +Example-wise Precision 79.59 +Example-wise Recall 61.14 +Hamming Accuracy 81.63 +Hamming Loss 18.37 +Macro F1 66.4 +Macro Jaccard 50.36 +Macro Precision 77.91 +Macro Recall 59.54 +Micro F1 67.66 +Micro Jaccard 51.13 +Micro Precision 77.13 +Micro Recall 60.27 +Subset 0/1 Loss 69.9 +Subset Accuracy 30.1 INFO Successfully finished after diff --git a/python/subprojects/testbed/tests/res/out/boomer-classifier/pruning-irep.txt b/python/subprojects/testbed/tests/res/out/boomer-classifier/pruning-irep.txt index ce04f271e..2fefd3804 100644 --- a/python/subprojects/testbed/tests/res/out/boomer-classifier/pruning-irep.txt +++ b/python/subprojects/testbed/tests/res/out/boomer-classifier/pruning-irep.txt @@ -12,21 +12,21 @@ DEBUG A dense matrix is used to store the predicted labels INFO Successfully predicted in INFO Evaluation result for test data: -Example-wise F1 61.11 -Example-wise Jaccard 53.19 -Example-wise Precision 78.66 -Example-wise Recall 61.14 -Hamming Accuracy 80.95 -Hamming Loss 19.05 -Macro F1 65.62 -Macro Jaccard 49.17 -Macro Precision 75.6 -Macro Recall 58.7 -Micro F1 66.77 -Micro Jaccard 50.11 -Micro Precision 75.25 -Micro Recall 60 -Subset 0/1 Loss 71.94 -Subset Accuracy 28.06 +Example-wise F1 60.26 
+Example-wise Jaccard 52.21 +Example-wise Precision 79.17 +Example-wise Recall 60.2 +Hamming Accuracy 80.61 +Hamming Loss 19.39 +Macro F1 64.23 +Macro Jaccard 48.09 +Macro Precision 74.63 +Macro Recall 57.38 +Micro F1 65.77 +Micro Jaccard 48.99 +Micro Precision 75.26 +Micro Recall 58.4 +Subset 0/1 Loss 73.98 +Subset Accuracy 26.02 INFO Successfully finished after diff --git a/python/subprojects/testbed/tests/res/out/boomer-regressor/instance-sampling-with-replacement.txt b/python/subprojects/testbed/tests/res/out/boomer-regressor/instance-sampling-with-replacement.txt index f541d2c73..f5d4673dc 100644 --- a/python/subprojects/testbed/tests/res/out/boomer-regressor/instance-sampling-with-replacement.txt +++ b/python/subprojects/testbed/tests/res/out/boomer-regressor/instance-sampling-with-replacement.txt @@ -12,9 +12,9 @@ DEBUG A dense matrix is used to store the predicted scores INFO Successfully predicted in INFO Evaluation result for test data: -Mean Absolute Error 31.23 -Mean Absolute Percentage Error 1.87 -Mean Squared Error 2088.96 -Median Absolute Error 19.05 +Mean Absolute Error 32.72 +Mean Absolute Percentage Error 2.22 +Mean Squared Error 2226.47 +Median Absolute Error 21.27 INFO Successfully finished after diff --git a/python/subprojects/testbed/tests/res/out/boomer-regressor/instance-sampling-without-replacement.txt b/python/subprojects/testbed/tests/res/out/boomer-regressor/instance-sampling-without-replacement.txt index db363f807..787c36769 100644 --- a/python/subprojects/testbed/tests/res/out/boomer-regressor/instance-sampling-without-replacement.txt +++ b/python/subprojects/testbed/tests/res/out/boomer-regressor/instance-sampling-without-replacement.txt @@ -12,9 +12,9 @@ DEBUG A dense matrix is used to store the predicted scores INFO Successfully predicted in INFO Evaluation result for test data: -Mean Absolute Error 29.87 -Mean Absolute Percentage Error 1.72 -Mean Squared Error 1919.64 -Median Absolute Error 20.32 +Mean Absolute Error 31.83 +Mean 
Absolute Percentage Error 2.01 +Mean Squared Error 2195.87 +Median Absolute Error 21.22 INFO Successfully finished after diff --git a/python/subprojects/testbed/tests/res/out/boomer-regressor/pruning-irep.txt b/python/subprojects/testbed/tests/res/out/boomer-regressor/pruning-irep.txt index 2d99d84dd..f0446ae92 100644 --- a/python/subprojects/testbed/tests/res/out/boomer-regressor/pruning-irep.txt +++ b/python/subprojects/testbed/tests/res/out/boomer-regressor/pruning-irep.txt @@ -12,9 +12,9 @@ DEBUG A dense matrix is used to store the predicted scores INFO Successfully predicted in INFO Evaluation result for test data: -Mean Absolute Error 32.74 -Mean Absolute Percentage Error 1.69 -Mean Squared Error 2245.15 -Median Absolute Error 19.41 +Mean Absolute Error 33.95 +Mean Absolute Percentage Error 1.95 +Mean Squared Error 2435.84 +Median Absolute Error 21.9 INFO Successfully finished after diff --git a/python/subprojects/testbed/tests/res/out/seco-classifier/instance-sampling-with-replacement.txt b/python/subprojects/testbed/tests/res/out/seco-classifier/instance-sampling-with-replacement.txt index 7a14f893a..7a1f01437 100644 --- a/python/subprojects/testbed/tests/res/out/seco-classifier/instance-sampling-with-replacement.txt +++ b/python/subprojects/testbed/tests/res/out/seco-classifier/instance-sampling-with-replacement.txt @@ -12,21 +12,21 @@ DEBUG A dense matrix is used to store the predicted labels INFO Successfully predicted in INFO Evaluation result for test data: -Example-wise F1 53.78 -Example-wise Jaccard 43.94 -Example-wise Precision 57.22 -Example-wise Recall 60.54 -Hamming Accuracy 71.34 -Hamming Loss 28.66 -Macro F1 49.31 -Macro Jaccard 35.42 -Macro Precision 67.16 -Macro Recall 57.9 -Micro F1 57.07 -Micro Jaccard 39.93 -Micro Precision 54.63 -Micro Recall 59.73 -Subset 0/1 Loss 83.16 -Subset Accuracy 16.84 +Example-wise F1 56.48 +Example-wise Jaccard 47.83 +Example-wise Precision 62.63 +Example-wise Recall 61.56 +Hamming Accuracy 73.98 +Hamming Loss 
26.02 +Macro F1 52.68 +Macro Jaccard 38.67 +Macro Precision 67.57 +Macro Recall 58.19 +Micro F1 59.74 +Micro Jaccard 42.59 +Micro Precision 58.96 +Micro Recall 60.53 +Subset 0/1 Loss 77.55 +Subset Accuracy 22.45 INFO Successfully finished after diff --git a/python/subprojects/testbed/tests/res/out/seco-classifier/instance-sampling-without-replacement.txt b/python/subprojects/testbed/tests/res/out/seco-classifier/instance-sampling-without-replacement.txt index 8d7863131..fd2079a4a 100644 --- a/python/subprojects/testbed/tests/res/out/seco-classifier/instance-sampling-without-replacement.txt +++ b/python/subprojects/testbed/tests/res/out/seco-classifier/instance-sampling-without-replacement.txt @@ -12,20 +12,20 @@ DEBUG A dense matrix is used to store the predicted labels INFO Successfully predicted in INFO Evaluation result for test data: -Example-wise F1 52.94 -Example-wise Jaccard 44.28 -Example-wise Precision 63.36 -Example-wise Recall 58.76 -Hamming Accuracy 72.87 -Hamming Loss 27.13 -Macro F1 51.37 -Macro Jaccard 37.24 -Macro Precision 66.09 -Macro Recall 56.01 -Micro F1 57.41 -Micro Jaccard 40.26 -Micro Precision 57.49 -Micro Recall 57.33 +Example-wise F1 54.21 +Example-wise Jaccard 45.35 +Example-wise Precision 62.47 +Example-wise Recall 58.84 +Hamming Accuracy 73.81 +Hamming Loss 26.19 +Macro F1 52.18 +Macro Jaccard 38.1 +Macro Precision 66.87 +Macro Recall 56.51 +Micro F1 58.6 +Micro Jaccard 41.44 +Micro Precision 59.08 +Micro Recall 58.13 Subset 0/1 Loss 80.61 Subset Accuracy 19.39 diff --git a/python/subprojects/testbed/tests/res/out/seco-classifier/pruning-irep.txt b/python/subprojects/testbed/tests/res/out/seco-classifier/pruning-irep.txt index 8d7863131..fd2079a4a 100644 --- a/python/subprojects/testbed/tests/res/out/seco-classifier/pruning-irep.txt +++ b/python/subprojects/testbed/tests/res/out/seco-classifier/pruning-irep.txt @@ -12,20 +12,20 @@ DEBUG A dense matrix is used to store the predicted labels INFO Successfully predicted in INFO Evaluation 
result for test data: -Example-wise F1 52.94 -Example-wise Jaccard 44.28 -Example-wise Precision 63.36 -Example-wise Recall 58.76 -Hamming Accuracy 72.87 -Hamming Loss 27.13 -Macro F1 51.37 -Macro Jaccard 37.24 -Macro Precision 66.09 -Macro Recall 56.01 -Micro F1 57.41 -Micro Jaccard 40.26 -Micro Precision 57.49 -Micro Recall 57.33 +Example-wise F1 54.21 +Example-wise Jaccard 45.35 +Example-wise Precision 62.47 +Example-wise Recall 58.84 +Hamming Accuracy 73.81 +Hamming Loss 26.19 +Macro F1 52.18 +Macro Jaccard 38.1 +Macro Precision 66.87 +Macro Recall 56.51 +Micro F1 58.6 +Micro Jaccard 41.44 +Micro Precision 59.08 +Micro Recall 58.13 Subset 0/1 Loss 80.61 Subset Accuracy 19.39