From e917a3dde24e0c0e739abf1a433f6821531b9a1b Mon Sep 17 00:00:00 2001 From: Michael Rapp Date: Tue, 1 Oct 2024 23:50:33 +0200 Subject: [PATCH 01/12] Add functions "getMinSampleSize", "setMinSampleSize", "getMaxSampleSize" and "setMaxSampleSize" to class IFeatureSamplingWithoutReplacementConfig. --- .../feature_sampling_without_replacement.hpp | 45 +++++++++++++++++++ .../feature_sampling_without_replacement.cpp | 33 ++++++++++++-- .../mlrl/common/cython/feature_sampling.pxd | 8 ++++ .../mlrl/common/cython/feature_sampling.pyx | 42 +++++++++++++++++ 4 files changed, 125 insertions(+), 3 deletions(-) diff --git a/cpp/subprojects/common/include/mlrl/common/sampling/feature_sampling_without_replacement.hpp b/cpp/subprojects/common/include/mlrl/common/sampling/feature_sampling_without_replacement.hpp index af1dc94a55..a2b22863af 100644 --- a/cpp/subprojects/common/include/mlrl/common/sampling/feature_sampling_without_replacement.hpp +++ b/cpp/subprojects/common/include/mlrl/common/sampling/feature_sampling_without_replacement.hpp @@ -34,6 +34,39 @@ class MLRLCOMMON_API IFeatureSamplingWithoutReplacementConfig { */ virtual IFeatureSamplingWithoutReplacementConfig& setSampleSize(float32 sampleSize) = 0; + /** + * Returns the minimum number of features that are included in a sample. + * + * @return The minimum number of features that are included in a sample + */ + virtual uint32 getMinSamples() const = 0; + + /** + * Sets the minimum number of features that should be included in a sample. + * + * @param minSamples The minimum number of features that should be included in a sample. Must be at least 1 + * @return A reference to an object of type `IFeatureSamplingWithoutReplacementConfig` that allows + * further configuration of the method for sampling features + */ + virtual IFeatureSamplingWithoutReplacementConfig& setMinSamples(uint32 minSamples) = 0; + + /** + * Returns the maximum number of features that are included in a sample. 
+ * + * @return The maximum number of features that are included in a sample + */ + virtual uint32 getMaxSamples() const = 0; + + /** + * Sets the maximum number of features that should be included in a sample. + * + * @param maxSamples The maximum number of features that should be included in a sample. Must be at least the value + * returned by `getMinSamples` or 0, if the number of features should not be restricted + * @return A reference to an object of type `IFeatureSamplingWithoutReplacementConfig` that allows + * further configuration of the method for sampling features + */ + virtual IFeatureSamplingWithoutReplacementConfig& setMaxSamples(uint32 maxSamples) = 0; + /** * Returns the number of trailing features that are always included in a sample. * @@ -63,6 +96,10 @@ class FeatureSamplingWithoutReplacementConfig final : public IFeatureSamplingCon float32 sampleSize_; + uint32 minSamples_; + + uint32 maxSamples_; + uint32 numRetained_; public: @@ -77,6 +114,14 @@ class FeatureSamplingWithoutReplacementConfig final : public IFeatureSamplingCon IFeatureSamplingWithoutReplacementConfig& setSampleSize(float32 sampleSize) override; + uint32 getMinSamples() const override; + + IFeatureSamplingWithoutReplacementConfig& setMinSamples(uint32 minSamples) override; + + uint32 getMaxSamples() const override; + + IFeatureSamplingWithoutReplacementConfig& setMaxSamples(uint32 maxSamples) override; + uint32 getNumRetained() const override; IFeatureSamplingWithoutReplacementConfig& setNumRetained(uint32 numRetained) override; diff --git a/cpp/subprojects/common/src/mlrl/common/sampling/feature_sampling_without_replacement.cpp b/cpp/subprojects/common/src/mlrl/common/sampling/feature_sampling_without_replacement.cpp index cbbb8bd655..f48d904f4c 100644 --- a/cpp/subprojects/common/src/mlrl/common/sampling/feature_sampling_without_replacement.cpp +++ b/cpp/subprojects/common/src/mlrl/common/sampling/feature_sampling_without_replacement.cpp @@ -4,6 +4,7 @@ #include 
"mlrl/common/indices/index_vector_partial.hpp" #include "mlrl/common/iterator/iterator_index.hpp" #include "mlrl/common/sampling/feature_sampling_predefined.hpp" +#include "mlrl/common/util/math.hpp" #include "mlrl/common/util/validation.hpp" /** @@ -98,7 +99,7 @@ class FeatureSamplingWithoutReplacementFactory final : public IFeatureSamplingFa }; FeatureSamplingWithoutReplacementConfig::FeatureSamplingWithoutReplacementConfig(ReadableProperty rngConfig) - : rngConfig_(rngConfig), sampleSize_(0), numRetained_(0) {} + : rngConfig_(rngConfig), sampleSize_(0), minSamples_(1), maxSamples_(0), numRetained_(0) {} float32 FeatureSamplingWithoutReplacementConfig::getSampleSize() const { return sampleSize_; @@ -111,6 +112,26 @@ IFeatureSamplingWithoutReplacementConfig& FeatureSamplingWithoutReplacementConfi return *this; } +uint32 FeatureSamplingWithoutReplacementConfig::getMinSamples() const { + return minSamples_; +} + +IFeatureSamplingWithoutReplacementConfig& FeatureSamplingWithoutReplacementConfig::setMinSamples(uint32 minSamples) { + util::assertGreaterOrEqual("minSamples", minSamples, 1); + minSamples_ = minSamples; + return *this; +} + +uint32 FeatureSamplingWithoutReplacementConfig::getMaxSamples() const { + return maxSamples_; +} + +IFeatureSamplingWithoutReplacementConfig& FeatureSamplingWithoutReplacementConfig::setMaxSamples(uint32 maxSamples) { + if (maxSamples != 0) util::assertGreaterOrEqual("maxSamples", maxSamples, minSamples_); + maxSamples_ = maxSamples; + return *this; +} + uint32 FeatureSamplingWithoutReplacementConfig::getNumRetained() const { return numRetained_; } @@ -126,8 +147,14 @@ std::unique_ptr FeatureSamplingWithoutReplacementConfig uint32 numFeatures = featureMatrix.getNumFeatures(); uint32 numRetained = std::min(numRetained_, numFeatures); uint32 numRemainingFeatures = numFeatures - numRetained; - uint32 numSamples = - static_cast(sampleSize_ > 0 ? 
sampleSize_ * numRemainingFeatures : log2(numRemainingFeatures - 1) + 1); + uint32 numSamples; + + if (sampleSize_ > 0) { + numSamples = util::calculateBoundedFraction(numRemainingFeatures, sampleSize_, minSamples_, maxSamples_); + } else { + numSamples = static_cast(log2(numRemainingFeatures - 1) + 1); + } + return std::make_unique(rngConfig_.get().createRNGFactory(), numFeatures, numSamples, numRetained); } diff --git a/python/subprojects/common/mlrl/common/cython/feature_sampling.pxd b/python/subprojects/common/mlrl/common/cython/feature_sampling.pxd index 1d57ef1b96..043242f695 100644 --- a/python/subprojects/common/mlrl/common/cython/feature_sampling.pxd +++ b/python/subprojects/common/mlrl/common/cython/feature_sampling.pxd @@ -11,6 +11,14 @@ cdef extern from "mlrl/common/sampling/feature_sampling_without_replacement.hpp" IFeatureSamplingWithoutReplacementConfig& setSampleSize(float32 sampleSize) except + + uint32 getMinSamples() const + + IFeatureSamplingWithoutReplacementConfig& setMinSamples(uint32 minSamples) except + + + uint32 getMaxSamples() const + + IFeatureSamplingWithoutReplacementConfig& setMaxSamples(uint32 maxSamples) except + + uint32 getNumRetained() const IFeatureSamplingWithoutReplacementConfig& setNumRetained(uint32 numRetained) except + diff --git a/python/subprojects/common/mlrl/common/cython/feature_sampling.pyx b/python/subprojects/common/mlrl/common/cython/feature_sampling.pyx index 576518b540..b8a2501e2f 100644 --- a/python/subprojects/common/mlrl/common/cython/feature_sampling.pyx +++ b/python/subprojects/common/mlrl/common/cython/feature_sampling.pyx @@ -32,6 +32,48 @@ cdef class FeatureSamplingWithoutReplacementConfig: self.config_ptr.setSampleSize(sample_size) return self + def get_min_samples(self) -> int: + """ + Returns the minimum number of features that are included in a sample. 
+ + :return: The minimum number of features that are included in a sample + """ + return self.config_ptr.getMinSamples() + + def set_min_samples(self, min_samples: int) -> FeatureSamplingWithoutReplacementConfig: + """ + Sets the minimum number of features that should be included in a sample. + + :param min_samples: The minimum number of features that should be included in a sample. Must be at least 1 + :return: A `FeatureSamplingWithoutReplacementConfig` that allows further configuration of the method + for sampling features + """ + assert_greater_or_equal('min_samples', min_samples, 1) + self.config_ptr.setMinSamples(min_samples) + return self + + def get_max_samples(self) -> int: + """ + Returns the maximum number of features that are included in a sample. + + :return: The maximum number of features that are included in a sample + """ + return self.config_ptr.getMaxSamples() + + def set_max_samples(self, max_samples: int) -> FeatureSamplingWithoutReplacementConfig: + """ + Sets the maximum number of features that should be included in a sample. + + :param max_samples: The maximum number of features that should be included in a sample. Must be at least + `get_min_samples()` or 0, if the number of features should not be restricted + :return: A `FeatureSamplingWithoutReplacementConfig` that allows further configuration of the method + for sampling features + """ + if max_samples != 0: + assert_greater_or_equal('max_samples', max_samples, self.get_min_samples()) + self.config_ptr.setMaxSamples(max_samples) + return self + def get_num_retained(self) -> int: """ Returns the number of trailing features that are always included in a sample. From 15ae7049f0b5e787b9bc85feb5f9753eddbe662d Mon Sep 17 00:00:00 2001 From: Michael Rapp Date: Tue, 1 Oct 2024 23:55:57 +0200 Subject: [PATCH 02/12] Add functions "getMinSampleSize", "setMinSampleSize", "getMaxSampleSize" and "setMaxSampleSize" to class IInstanceSamplingWithReplacementConfig. 
--- .../instance_sampling_with_replacement.hpp | 45 +++++++ .../instance_sampling_with_replacement.cpp | 114 ++++++++++++------ .../mlrl/common/cython/instance_sampling.pxd | 10 +- .../mlrl/common/cython/instance_sampling.pyx | 44 ++++++- 4 files changed, 174 insertions(+), 39 deletions(-) diff --git a/cpp/subprojects/common/include/mlrl/common/sampling/instance_sampling_with_replacement.hpp b/cpp/subprojects/common/include/mlrl/common/sampling/instance_sampling_with_replacement.hpp index 6030380ae6..dce7079002 100644 --- a/cpp/subprojects/common/include/mlrl/common/sampling/instance_sampling_with_replacement.hpp +++ b/cpp/subprojects/common/include/mlrl/common/sampling/instance_sampling_with_replacement.hpp @@ -33,6 +33,39 @@ class MLRLCOMMON_API IInstanceSamplingWithReplacementConfig { * further configuration of the method for sampling instances */ virtual IInstanceSamplingWithReplacementConfig& setSampleSize(float32 sampleSize) = 0; + + /** + * Returns the minimum number of examples that are included in a sample. + * + * @return The minimum number of examples that are included in a sample + */ + virtual uint32 getMinSamples() const = 0; + + /** + * Sets the minimum number of examples that should be included in a sample. + * + * @param minSamples The minimum number of examples that should be included in a sample. Must be at least 1 + * @return A reference to an object of type `IInstanceSamplingWithReplacementConfig` that allows + * further configuration of the method for sampling instances + */ + virtual IInstanceSamplingWithReplacementConfig& setMinSamples(uint32 minSamples) = 0; + + /** + * Returns the maximum number of examples that are included in a sample. + * + * @return The maximum number of examples that are included in a sample + */ + virtual uint32 getMaxSamples() const = 0; + + /** + * Sets the maximum number of examples that should be included in a sample. + * + * @param maxSamples The maximum number of examples that should be included in a sample. 
Must be at least the value + * returned by `getMinSamples` or 0, if the number of examples should not be restricted + * @return A reference to an object of type `IInstanceSamplingWithReplacementConfig` that allows + * further configuration of the method for sampling instances + */ + virtual IInstanceSamplingWithReplacementConfig& setMaxSamples(uint32 maxSamples) = 0; }; /** @@ -47,6 +80,10 @@ class InstanceSamplingWithReplacementConfig final : public IClassificationInstan float32 sampleSize_; + uint32 minSamples_; + + uint32 maxSamples_; + public: /** @@ -59,6 +96,14 @@ class InstanceSamplingWithReplacementConfig final : public IClassificationInstan IInstanceSamplingWithReplacementConfig& setSampleSize(float32 sampleSize) override; + uint32 getMinSamples() const override; + + IInstanceSamplingWithReplacementConfig& setMinSamples(uint32 minSamples) override; + + uint32 getMaxSamples() const override; + + IInstanceSamplingWithReplacementConfig& setMaxSamples(uint32 maxSamples) override; + std::unique_ptr createClassificationInstanceSamplingFactory() const override; diff --git a/cpp/subprojects/common/src/mlrl/common/sampling/instance_sampling_with_replacement.cpp b/cpp/subprojects/common/src/mlrl/common/sampling/instance_sampling_with_replacement.cpp index c84dbbf5a3..3e1e71e2ed 100644 --- a/cpp/subprojects/common/src/mlrl/common/sampling/instance_sampling_with_replacement.cpp +++ b/cpp/subprojects/common/src/mlrl/common/sampling/instance_sampling_with_replacement.cpp @@ -3,12 +3,13 @@ #include "mlrl/common/sampling/partition_bi.hpp" #include "mlrl/common/sampling/partition_single.hpp" #include "mlrl/common/sampling/weight_vector_dense.hpp" +#include "mlrl/common/util/math.hpp" #include "mlrl/common/util/validation.hpp" -static inline void sampleInternally(const SinglePartition& partition, float32 sampleSize, - DenseWeightVector& weightVector, RNG& rng) { +static inline void sampleInternally(const SinglePartition& partition, float32 sampleSize, uint32 minSamples, + uint32 
maxSamples, DenseWeightVector& weightVector, RNG& rng) { uint32 numExamples = partition.getNumElements(); - uint32 numSamples = static_cast(sampleSize * numExamples); + uint32 numSamples = util::calculateBoundedFraction(numExamples, sampleSize, minSamples, maxSamples); typename DenseWeightVector::iterator weightIterator = weightVector.begin(); util::setViewToZeros(weightIterator, numExamples); uint32 numNonZeroWeights = 0; @@ -29,11 +30,11 @@ static inline void sampleInternally(const SinglePartition& partition, float32 sa weightVector.setNumNonZeroWeights(numNonZeroWeights); } -static inline void sampleInternally(BiPartition& partition, float32 sampleSize, DenseWeightVector& weightVector, - RNG& rng) { +static inline void sampleInternally(BiPartition& partition, float32 sampleSize, uint32 minSamples, uint32 maxSamples, + DenseWeightVector& weightVector, RNG& rng) { uint32 numExamples = partition.getNumElements(); uint32 numTrainingExamples = partition.getNumFirst(); - uint32 numSamples = static_cast(sampleSize * numTrainingExamples); + uint32 numSamples = util::calculateBoundedFraction(numTrainingExamples, sampleSize, minSamples, maxSamples); BiPartition::const_iterator indexIterator = partition.first_cbegin(); typename DenseWeightVector::iterator weightIterator = weightVector.begin(); util::setViewToZeros(weightIterator, numExamples); @@ -72,24 +73,32 @@ class InstanceSamplingWithReplacement final : public IInstanceSampling { const float32 sampleSize_; + const uint32 minSamples_; + + const uint32 maxSamples_; + DenseWeightVector weightVector_; public: /** - * @param rngPtr An unique pointer to an object of type `RNG` that should be used for generating random - * numbers - * @param partition A reference to an object of template type `Partition` that provides access to the indices - * of the examples that are included in the training set - * @param sampleSize The fraction of examples to be included in the sample (e.g. 
a value of 0.6 corresponds to - * 60 % of the available examples). Must be in (0, 1] + * @param rngPtr An unique pointer to an object of type `RNG` that should be used for generating random + * numbers + * @param partition A reference to an object of template type `Partition` that provides access to the + * indices of the examples that are included in the training set + * @param sampleSize The fraction of examples to be included in the sample (e.g. a value of 0.6 corresponds + * to 60 % of the available examples). Must be in (0, 1] + * @param minSamples The minimum number of examples to be included in the sample. Must be at least 1 + * @param maxSamples The maximum number of examples to be included in the sample. Must be at least + * `minSamples` or 0, if the number of examples should not be restricted */ - InstanceSamplingWithReplacement(std::unique_ptr rngPtr, Partition& partition, float32 sampleSize) - : rngPtr_(std::move(rngPtr)), partition_(partition), sampleSize_(sampleSize), - weightVector_(partition.getNumElements()) {} + InstanceSamplingWithReplacement(std::unique_ptr rngPtr, Partition& partition, float32 sampleSize, + uint32 minSamples, uint32 maxSamples) + : rngPtr_(std::move(rngPtr)), partition_(partition), sampleSize_(sampleSize), minSamples_(minSamples), + maxSamples_(maxSamples), weightVector_(partition.getNumElements()) {} const IWeightVector& sample() override { - sampleInternally(partition_, sampleSize_, weightVector_, *rngPtr_); + sampleInternally(partition_, sampleSize_, minSamples_, maxSamples_, weightVector_, *rngPtr_); return weightVector_; } }; @@ -106,6 +115,10 @@ class InstanceSamplingWithReplacementFactory final : public IClassificationInsta const float32 sampleSize_; + const uint32 minSamples_; + + const uint32 maxSamples_; + public: /** @@ -113,64 +126,69 @@ class InstanceSamplingWithReplacementFactory final : public IClassificationInsta * generators * @param sampleSize The fraction of examples to be included in the sample (e.g. 
a value of 0.6 corresponds * to 60 % of the available examples). Must be in (0, 1] + * @param minSamples The minimum number of examples to be included in the sample. Must be at least 1 + * @param maxSamples The maximum number of examples to be included in the sample. Must be at least + * `minSamples` or 0, if the number of examples should not be restricted */ - InstanceSamplingWithReplacementFactory(std::unique_ptr rngFactoryPtr, float32 sampleSize) - : rngFactoryPtr_(std::move(rngFactoryPtr)), sampleSize_(sampleSize) {} + InstanceSamplingWithReplacementFactory(std::unique_ptr rngFactoryPtr, float32 sampleSize, + uint32 minSamples, uint32 maxSamples) + : rngFactoryPtr_(std::move(rngFactoryPtr)), sampleSize_(sampleSize), minSamples_(minSamples), + maxSamples_(maxSamples) {} std::unique_ptr create(const CContiguousView& labelMatrix, const SinglePartition& partition, IStatistics& statistics) const override { - return std::make_unique>(rngFactoryPtr_->create(), - partition, sampleSize_); + return std::make_unique>( + rngFactoryPtr_->create(), partition, sampleSize_, minSamples_, maxSamples_); } std::unique_ptr create(const CContiguousView& labelMatrix, BiPartition& partition, IStatistics& statistics) const override { - return std::make_unique>(rngFactoryPtr_->create(), partition, - sampleSize_); + return std::make_unique>( + rngFactoryPtr_->create(), partition, sampleSize_, minSamples_, maxSamples_); } std::unique_ptr create(const BinaryCsrView& labelMatrix, const SinglePartition& partition, IStatistics& statistics) const override { - return std::make_unique>(rngFactoryPtr_->create(), - partition, sampleSize_); + return std::make_unique>( + rngFactoryPtr_->create(), partition, sampleSize_, minSamples_, maxSamples_); } std::unique_ptr create(const BinaryCsrView& labelMatrix, BiPartition& partition, IStatistics& statistics) const override { - return std::make_unique>(rngFactoryPtr_->create(), partition, - sampleSize_); + return std::make_unique>( + 
rngFactoryPtr_->create(), partition, sampleSize_, minSamples_, maxSamples_); } std::unique_ptr create(const CContiguousView& regressionMatrix, const SinglePartition& partition, IStatistics& statistics) const override { - return std::make_unique>(rngFactoryPtr_->create(), - partition, sampleSize_); + return std::make_unique>( + rngFactoryPtr_->create(), partition, sampleSize_, minSamples_, maxSamples_); } std::unique_ptr create(const CContiguousView& regressionMatrix, BiPartition& partition, IStatistics& statistics) const override { - return std::make_unique>(rngFactoryPtr_->create(), partition, - sampleSize_); + return std::make_unique>( + rngFactoryPtr_->create(), partition, sampleSize_, minSamples_, maxSamples_); } std::unique_ptr create(const CsrView& regressionMatrix, const SinglePartition& partition, IStatistics& statistics) const override { - return std::make_unique>(rngFactoryPtr_->create(), - partition, sampleSize_); + return std::make_unique>( + rngFactoryPtr_->create(), partition, sampleSize_, minSamples_, maxSamples_); } std::unique_ptr create(const CsrView& regressionMatrix, BiPartition& partition, IStatistics& statistics) const override { - return std::make_unique>(rngFactoryPtr_->create(), partition, - sampleSize_); + return std::make_unique>( + rngFactoryPtr_->create(), partition, sampleSize_, minSamples_, maxSamples_); } }; InstanceSamplingWithReplacementConfig::InstanceSamplingWithReplacementConfig(ReadableProperty rngConfig) - : rngConfig_(rngConfig), sampleSize_(0.66f) {} + : rngConfig_(rngConfig), sampleSize_(0.66f), minSamples_(1), maxSamples_(0) {} float32 InstanceSamplingWithReplacementConfig::getSampleSize() const { return sampleSize_; @@ -183,12 +201,34 @@ IInstanceSamplingWithReplacementConfig& InstanceSamplingWithReplacementConfig::s return *this; } +uint32 InstanceSamplingWithReplacementConfig::getMinSamples() const { + return minSamples_; +} + +IInstanceSamplingWithReplacementConfig& 
InstanceSamplingWithReplacementConfig::setMinSamples(uint32 minSamples) { + util::assertGreaterOrEqual("minSamples", minSamples, 1); + minSamples_ = minSamples; + return *this; +} + +uint32 InstanceSamplingWithReplacementConfig::getMaxSamples() const { + return maxSamples_; +} + +IInstanceSamplingWithReplacementConfig& InstanceSamplingWithReplacementConfig::setMaxSamples(uint32 maxSamples) { + if (maxSamples != 0) util::assertGreaterOrEqual("maxSamples", maxSamples, minSamples_); + maxSamples_ = maxSamples; + return *this; +} + std::unique_ptr InstanceSamplingWithReplacementConfig::createClassificationInstanceSamplingFactory() const { - return std::make_unique(rngConfig_.get().createRNGFactory(), sampleSize_); + return std::make_unique(rngConfig_.get().createRNGFactory(), sampleSize_, + minSamples_, maxSamples_); } std::unique_ptr InstanceSamplingWithReplacementConfig::createRegressionInstanceSamplingFactory() const { - return std::make_unique(rngConfig_.get().createRNGFactory(), sampleSize_); + return std::make_unique(rngConfig_.get().createRNGFactory(), sampleSize_, + minSamples_, maxSamples_); } diff --git a/python/subprojects/common/mlrl/common/cython/instance_sampling.pxd b/python/subprojects/common/mlrl/common/cython/instance_sampling.pxd index 14c1a2820e..921c190c3e 100644 --- a/python/subprojects/common/mlrl/common/cython/instance_sampling.pxd +++ b/python/subprojects/common/mlrl/common/cython/instance_sampling.pxd @@ -1,4 +1,4 @@ -from mlrl.common.cython._types cimport float32 +from mlrl.common.cython._types cimport float32, uint32 cdef extern from "mlrl/common/sampling/instance_sampling_stratified_example_wise.hpp" nogil: @@ -33,6 +33,14 @@ cdef extern from "mlrl/common/sampling/instance_sampling_with_replacement.hpp" n IInstanceSamplingWithReplacementConfig& setSampleSize(float32 sampleSize) + uint32 getMinSamples() const + + IInstanceSamplingWithReplacementConfig& setMinSamples(uint32 minSamples) + + uint32 getMaxSamples() const + + 
IInstanceSamplingWithReplacementConfig& setMaxSamples(uint32 maxSamples) + cdef extern from "mlrl/common/sampling/instance_sampling_without_replacement.hpp" nogil: diff --git a/python/subprojects/common/mlrl/common/cython/instance_sampling.pyx b/python/subprojects/common/mlrl/common/cython/instance_sampling.pyx index 3085b4366e..63f41cbf19 100644 --- a/python/subprojects/common/mlrl/common/cython/instance_sampling.pyx +++ b/python/subprojects/common/mlrl/common/cython/instance_sampling.pyx @@ -1,7 +1,7 @@ """ @author: Michael Rapp (michael.rapp.ml@gmail.com) """ -from mlrl.common.cython.validation import assert_greater, assert_less +from mlrl.common.cython.validation import assert_greater, assert_greater_or_equal, assert_less cdef class ExampleWiseStratifiedInstanceSamplingConfig: @@ -89,6 +89,48 @@ cdef class InstanceSamplingWithReplacementConfig: self.config_ptr.setSampleSize(sample_size) return self + def get_min_samples(self) -> int: + """ + Returns the minimum number of examples that are included in a sample. + + :return: The minimum number of examples that are included in a sample + """ + return self.config_ptr.getMinSamples() + + def set_min_samples(self, min_samples: int) -> InstanceSamplingWithReplacementConfig: + """ + Sets the minimum number of examples that should be included in a sample. + + :param min_samples: The minimum number of examples that should be included in a sample. Must be at least 1 + :return: An `InstanceSamplingWithReplacementConfig` that allows further configuration of the method + for sampling instances + """ + assert_greater_or_equal('min_samples', min_samples, 1) + self.config_ptr.setMinSamples(min_samples) + return self + + def get_max_samples(self) -> int: + """ + Returns the maximum number of examples that are included in a sample. 
+ + :return: The maximum number of examples that are included in a sample + """ + return self.config_ptr.getMaxSamples() + + def set_max_samples(self, max_samples: int) -> InstanceSamplingWithReplacementConfig: + """ + Sets the maximum number of examples that should be included in a sample. + + :param max_samples: The maximum number of examples that should be included in a sample. Must be at least + `get_min_samples()` or 0, if the number of examples should not be restricted + :return: An `InstanceSamplingWithReplacementConfig` that allows further configuration of the method + for sampling instances + """ + if max_samples != 0: + assert_greater_or_equal('max_samples', max_samples, self.get_min_samples()) + self.config_ptr.setMaxSamples(max_samples) + return self + cdef class InstanceSamplingWithoutReplacementConfig: """ From 1498542b066d162d75e4bc1652b02d850bfda767 Mon Sep 17 00:00:00 2001 From: Michael Rapp Date: Tue, 1 Oct 2024 23:58:56 +0200 Subject: [PATCH 03/12] Add functions "getMinSampleSize", "setMinSampleSize", "getMaxSampleSize" and "setMaxSampleSize" to class IInstanceSamplingWithoutReplacementConfig. 
--- .../instance_sampling_without_replacement.hpp | 45 ++++++++ .../instance_sampling_without_replacement.cpp | 104 ++++++++++++------ .../mlrl/common/cython/instance_sampling.pxd | 8 ++ .../mlrl/common/cython/instance_sampling.pyx | 42 +++++++ 4 files changed, 166 insertions(+), 33 deletions(-) diff --git a/cpp/subprojects/common/include/mlrl/common/sampling/instance_sampling_without_replacement.hpp b/cpp/subprojects/common/include/mlrl/common/sampling/instance_sampling_without_replacement.hpp index 4156b90b3b..13f08f8252 100644 --- a/cpp/subprojects/common/include/mlrl/common/sampling/instance_sampling_without_replacement.hpp +++ b/cpp/subprojects/common/include/mlrl/common/sampling/instance_sampling_without_replacement.hpp @@ -33,6 +33,39 @@ class MLRLCOMMON_API IInstanceSamplingWithoutReplacementConfig { * further configuration of the method for sampling instances */ virtual IInstanceSamplingWithoutReplacementConfig& setSampleSize(float32 sampleSize) = 0; + + /** + * Returns the minimum number of examples that are included in a sample. + * + * @return The minimum number of examples that are included in a sample + */ + virtual uint32 getMinSamples() const = 0; + + /** + * Sets the minimum number of examples that should be included in a sample. + * + * @param minSamples The minimum number of examples that should be included in a sample. Must be at least 1 + * @return A reference to an object of type `IInstanceSamplingWithoutReplacementConfig` that allows + * further configuration of the method for sampling instances + */ + virtual IInstanceSamplingWithoutReplacementConfig& setMinSamples(uint32 minSamples) = 0; + + /** + * Returns the maximum number of examples that are included in a sample. + * + * @return The maximum number of examples that are included in a sample + */ + virtual uint32 getMaxSamples() const = 0; + + /** + * Sets the maximum number of examples that should be included in a sample. 
+ * + * @param maxSamples The maximum number of examples that should be included in a sample. Must be at least the value + * returned by `getMinSamples` or 0, if the number of examples should not be restricted + * @return A reference to an object of type `IInstanceSamplingWithoutReplacementConfig` that allows + * further configuration of the method for sampling instances + */ + virtual IInstanceSamplingWithoutReplacementConfig& setMaxSamples(uint32 maxSamples) = 0; }; /** @@ -47,6 +80,10 @@ class InstanceSamplingWithoutReplacementConfig final : public IClassificationIns float32 sampleSize_; + uint32 minSamples_; + + uint32 maxSamples_; + public: /** @@ -59,6 +96,14 @@ class InstanceSamplingWithoutReplacementConfig final : public IClassificationIns IInstanceSamplingWithoutReplacementConfig& setSampleSize(float32 sampleSize) override; + uint32 getMinSamples() const override; + + IInstanceSamplingWithoutReplacementConfig& setMinSamples(uint32 minSamples) override; + + uint32 getMaxSamples() const override; + + IInstanceSamplingWithoutReplacementConfig& setMaxSamples(uint32 maxSamples) override; + std::unique_ptr createClassificationInstanceSamplingFactory() const override; diff --git a/cpp/subprojects/common/src/mlrl/common/sampling/instance_sampling_without_replacement.cpp b/cpp/subprojects/common/src/mlrl/common/sampling/instance_sampling_without_replacement.cpp index 725d80733a..52fd450322 100644 --- a/cpp/subprojects/common/src/mlrl/common/sampling/instance_sampling_without_replacement.cpp +++ b/cpp/subprojects/common/src/mlrl/common/sampling/instance_sampling_without_replacement.cpp @@ -4,20 +4,21 @@ #include "mlrl/common/sampling/partition_bi.hpp" #include "mlrl/common/sampling/partition_single.hpp" #include "mlrl/common/sampling/weight_sampling.hpp" +#include "mlrl/common/util/math.hpp" #include "mlrl/common/util/validation.hpp" -static inline void sampleInternally(const SinglePartition& partition, float32 sampleSize, BitWeightVector& weightVector, - RNG& rng) { 
+static inline void sampleInternally(const SinglePartition& partition, float32 sampleSize, uint32 minSamples, + uint32 maxSamples, BitWeightVector& weightVector, RNG& rng) { uint32 numExamples = partition.getNumElements(); - uint32 numSamples = static_cast(sampleSize * numExamples); + uint32 numSamples = util::calculateBoundedFraction(numExamples, sampleSize, minSamples, maxSamples); sampleWeightsWithoutReplacement(weightVector, partition.cbegin(), numExamples, numSamples, rng); } -static inline void sampleInternally(BiPartition& partition, float32 sampleSize, BitWeightVector& weightVector, - RNG& rng) { +static inline void sampleInternally(BiPartition& partition, float32 sampleSize, uint32 minSamples, uint32 maxSamples, + BitWeightVector& weightVector, RNG& rng) { uint32 numTrainingExamples = partition.getNumFirst(); - uint32 numSamples = static_cast(sampleSize * numTrainingExamples); + uint32 numSamples = util::calculateBoundedFraction(numTrainingExamples, sampleSize, minSamples, maxSamples); sampleWeightsWithoutReplacement(weightVector, partition.first_cbegin(), numTrainingExamples, numSamples, rng); } @@ -38,6 +39,10 @@ class InstanceSamplingWithoutReplacement final : public IInstanceSampling { const float32 sampleSize_; + const uint32 minSamples_; + + const uint32 maxSamples_; + BitWeightVector weightVector_; public: @@ -49,13 +54,17 @@ class InstanceSamplingWithoutReplacement final : public IInstanceSampling { * indices of the examples that are included in the training set * @param sampleSize The fraction of examples to be included in the sample (e.g. a value of 0.6 corresponds * to 60 % of the available examples). Must be in (0, 1) + * @param minSamples The minimum number of examples to be included in the sample. Must be at least 1 + * @param maxSamples The maximum number of examples to be included in the sample. 
Must be at least + * `minSamples` or 0, if the number of examples should not be restricted */ - InstanceSamplingWithoutReplacement(std::unique_ptr rngPtr, Partition& partition, float32 sampleSize) - : rngPtr_(std::move(rngPtr)), partition_(partition), sampleSize_(sampleSize), - weightVector_(partition.getNumElements()) {} + InstanceSamplingWithoutReplacement(std::unique_ptr rngPtr, Partition& partition, float32 sampleSize, + uint32 minSamples, uint32 maxSamples) + : rngPtr_(std::move(rngPtr)), partition_(partition), sampleSize_(sampleSize), minSamples_(minSamples), + maxSamples_(maxSamples), weightVector_(partition.getNumElements()) {} const IWeightVector& sample() override { - sampleInternally(partition_, sampleSize_, weightVector_, *rngPtr_); + sampleInternally(partition_, sampleSize_, minSamples_, maxSamples_, weightVector_, *rngPtr_); return weightVector_; } }; @@ -72,6 +81,10 @@ class InstanceSamplingWithoutReplacementFactory final : public IClassificationIn const float32 sampleSize_; + const uint32 minSamples_; + + const uint32 maxSamples_; + public: /** @@ -79,65 +92,70 @@ class InstanceSamplingWithoutReplacementFactory final : public IClassificationIn * generators * @param sampleSize The fraction of examples to be included in the sample (e.g. a value of 0.6 corresponds * to 60 % of the available examples). Must be in (0, 1) + * @param minSamples The minimum number of examples to be included in the sample. Must be at least 1 + * @param maxSamples The maximum number of examples to be included in the sample. 
Must be at least + * `minSamples` or 0, if the number of examples should not be restricted */ - InstanceSamplingWithoutReplacementFactory(std::unique_ptr rngFactoryPtr, float32 sampleSize) - : rngFactoryPtr_(std::move(rngFactoryPtr)), sampleSize_(sampleSize) {} + InstanceSamplingWithoutReplacementFactory(std::unique_ptr rngFactoryPtr, float32 sampleSize, + uint32 minSamples, uint32 maxSamples) + : rngFactoryPtr_(std::move(rngFactoryPtr)), sampleSize_(sampleSize), minSamples_(minSamples), + maxSamples_(maxSamples) {} std::unique_ptr create(const CContiguousView& labelMatrix, const SinglePartition& partition, IStatistics& statistics) const override { - return std::make_unique>(rngFactoryPtr_->create(), - partition, sampleSize_); + return std::make_unique>( + rngFactoryPtr_->create(), partition, sampleSize_, minSamples_, maxSamples_); } std::unique_ptr create(const CContiguousView& labelMatrix, BiPartition& partition, IStatistics& statistics) const override { - return std::make_unique>(rngFactoryPtr_->create(), - partition, sampleSize_); + return std::make_unique>( + rngFactoryPtr_->create(), partition, sampleSize_, minSamples_, maxSamples_); } std::unique_ptr create(const BinaryCsrView& labelMatrix, const SinglePartition& partition, IStatistics& statistics) const override { - return std::make_unique>(rngFactoryPtr_->create(), - partition, sampleSize_); + return std::make_unique>( + rngFactoryPtr_->create(), partition, sampleSize_, minSamples_, maxSamples_); } std::unique_ptr create(const BinaryCsrView& labelMatrix, BiPartition& partition, IStatistics& statistics) const override { - return std::make_unique>(rngFactoryPtr_->create(), - partition, sampleSize_); + return std::make_unique>( + rngFactoryPtr_->create(), partition, sampleSize_, minSamples_, maxSamples_); } std::unique_ptr create(const CContiguousView& regressionMatrix, const SinglePartition& partition, IStatistics& statistics) const override { - return std::make_unique>(rngFactoryPtr_->create(), - partition, 
sampleSize_); + return std::make_unique>( + rngFactoryPtr_->create(), partition, sampleSize_, minSamples_, maxSamples_); } std::unique_ptr create(const CContiguousView& regressionMatrix, BiPartition& partition, IStatistics& statistics) const override { - return std::make_unique>(rngFactoryPtr_->create(), - partition, sampleSize_); + return std::make_unique>( + rngFactoryPtr_->create(), partition, sampleSize_, minSamples_, maxSamples_); } std::unique_ptr create(const CsrView& regressionMatrix, const SinglePartition& partition, IStatistics& statistics) const override { - return std::make_unique>(rngFactoryPtr_->create(), - partition, sampleSize_); + return std::make_unique>( + rngFactoryPtr_->create(), partition, sampleSize_, minSamples_, maxSamples_); } std::unique_ptr create(const CsrView& regressionMatrix, BiPartition& partition, IStatistics& statistics) const override { - return std::make_unique>(rngFactoryPtr_->create(), - partition, sampleSize_); + return std::make_unique>( + rngFactoryPtr_->create(), partition, sampleSize_, minSamples_, maxSamples_); } }; InstanceSamplingWithoutReplacementConfig::InstanceSamplingWithoutReplacementConfig( ReadableProperty rngConfig) - : rngConfig_(rngConfig), sampleSize_(0.66f) {} + : rngConfig_(rngConfig), sampleSize_(0.66f), minSamples_(1), maxSamples_(0) {} float32 InstanceSamplingWithoutReplacementConfig::getSampleSize() const { return sampleSize_; @@ -150,14 +168,34 @@ IInstanceSamplingWithoutReplacementConfig& InstanceSamplingWithoutReplacementCon return *this; } +uint32 InstanceSamplingWithoutReplacementConfig::getMinSamples() const { + return minSamples_; +} + +IInstanceSamplingWithoutReplacementConfig& InstanceSamplingWithoutReplacementConfig::setMinSamples(uint32 minSamples) { + util::assertGreaterOrEqual("minSamples", minSamples, 1); + minSamples_ = minSamples; + return *this; +} + +uint32 InstanceSamplingWithoutReplacementConfig::getMaxSamples() const { + return maxSamples_; +} + 
+IInstanceSamplingWithoutReplacementConfig& InstanceSamplingWithoutReplacementConfig::setMaxSamples(uint32 maxSamples) { + if (maxSamples != 0) util::assertGreaterOrEqual("maxSamples", maxSamples, minSamples_); + maxSamples_ = maxSamples; + return *this; +} + std::unique_ptr InstanceSamplingWithoutReplacementConfig::createClassificationInstanceSamplingFactory() const { - return std::make_unique(rngConfig_.get().createRNGFactory(), - sampleSize_); + return std::make_unique(rngConfig_.get().createRNGFactory(), sampleSize_, + minSamples_, maxSamples_); } std::unique_ptr InstanceSamplingWithoutReplacementConfig::createRegressionInstanceSamplingFactory() const { - return std::make_unique(rngConfig_.get().createRNGFactory(), - sampleSize_); + return std::make_unique(rngConfig_.get().createRNGFactory(), sampleSize_, + minSamples_, maxSamples_); } diff --git a/python/subprojects/common/mlrl/common/cython/instance_sampling.pxd b/python/subprojects/common/mlrl/common/cython/instance_sampling.pxd index 921c190c3e..9921ad1c30 100644 --- a/python/subprojects/common/mlrl/common/cython/instance_sampling.pxd +++ b/python/subprojects/common/mlrl/common/cython/instance_sampling.pxd @@ -52,6 +52,14 @@ cdef extern from "mlrl/common/sampling/instance_sampling_without_replacement.hpp IInstanceSamplingWithoutReplacementConfig& setSampleSize(float32 sampleSize) + uint32 getMinSamples() const + + IInstanceSamplingWithoutReplacementConfig& setMinSamples(uint32 minSamples) + + uint32 getMaxSamples() const + + IInstanceSamplingWithoutReplacementConfig& setMaxSamples(uint32 maxSamples) + cdef class ExampleWiseStratifiedInstanceSamplingConfig: diff --git a/python/subprojects/common/mlrl/common/cython/instance_sampling.pyx b/python/subprojects/common/mlrl/common/cython/instance_sampling.pyx index 63f41cbf19..33c48006c6 100644 --- a/python/subprojects/common/mlrl/common/cython/instance_sampling.pyx +++ b/python/subprojects/common/mlrl/common/cython/instance_sampling.pyx @@ -158,3 +158,45
@@ cdef class InstanceSamplingWithoutReplacementConfig: assert_less('sample_size', sample_size, 1) self.config_ptr.setSampleSize(sample_size) return self + + def get_min_samples(self) -> int: + """ + Returns the minimum number of examples that are included in a sample. + + :return: The minimum number of examples that are included in a sample + """ + return self.config_ptr.getMinSamples() + + def set_min_samples(self, min_samples: int) -> InstanceSamplingWithoutReplacementConfig: + """ + Sets the minimum number of examples that should be included in a sample. + + :param min_samples: The minimum number of examples that should be included in a sample. Must be at least 1 + :return: An `InstanceSamplingWithoutReplacementConfig` that allows further configuration of the method + for sampling instances + """ + assert_greater_or_equal('min_samples', min_samples, 1) + self.config_ptr.setMinSamples(min_samples) + return self + + def get_max_samples(self) -> int: + """ + Returns the maximum number of examples that are included in a sample. + + :return: The maximum number of examples that are included in a sample + """ + return self.config_ptr.getMaxSamples() + + def set_max_samples(self, max_samples: int) -> InstanceSamplingWithoutReplacementConfig: + """ + Sets the maximum number of examples that should be included in a sample. + + :param max_samples: The maximum number of examples that should be included in a sample.
Must be at least + `get_min_samples()` or 0, if the number of examples should not be restricted + :return: An `InstanceSamplingWithoutReplacementConfig` that allows further configuration of the method + for sampling instances + """ + if max_samples != 0: + assert_greater_or_equal('max_samples', max_samples, self.get_min_samples()) + self.config_ptr.setMaxSamples(max_samples) + return self From 016932062159910bdac1654abe6179b7056b2e53 Mon Sep 17 00:00:00 2001 From: Michael Rapp Date: Wed, 2 Oct 2024 00:37:15 +0200 Subject: [PATCH 04/12] Add arguments "minSamples" and "maxSamples" to function "sampleWeights" of the class LabelWiseStratification. --- .../common/sampling/stratified_sampling_output_wise.hpp | 5 ++++- .../common/sampling/stratified_sampling_output_wise.cpp | 6 ++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/cpp/subprojects/common/include/mlrl/common/sampling/stratified_sampling_output_wise.hpp b/cpp/subprojects/common/include/mlrl/common/sampling/stratified_sampling_output_wise.hpp index b75f40177c..d6bd4dc180 100644 --- a/cpp/subprojects/common/include/mlrl/common/sampling/stratified_sampling_output_wise.hpp +++ b/cpp/subprojects/common/include/mlrl/common/sampling/stratified_sampling_output_wise.hpp @@ -49,8 +49,11 @@ class LabelWiseStratification final { * * @param weightVector A reference to an object of type `BitWeightVector`, the weights should be written to * @param sampleSize The fraction of the available examples to be selected + * @param minSamples The minimum number of examples to be included in the sample. Must be at least 1 + * @param maxSamples The maximum number of examples to be included in the sample.
Must be at least + * `minSamples` or 0, if the number of examples should not be restricted */ - void sampleWeights(BitWeightVector& weightVector, float32 sampleSize); + void sampleWeights(BitWeightVector& weightVector, float32 sampleSize, uint32 minSamples, uint32 maxSamples); /** * Randomly splits the available examples into two distinct sets and updates a given `BiPartition` accordingly. diff --git a/cpp/subprojects/common/src/mlrl/common/sampling/stratified_sampling_output_wise.cpp b/cpp/subprojects/common/src/mlrl/common/sampling/stratified_sampling_output_wise.cpp index fc487ae9ed..446b272f94 100644 --- a/cpp/subprojects/common/src/mlrl/common/sampling/stratified_sampling_output_wise.cpp +++ b/cpp/subprojects/common/src/mlrl/common/sampling/stratified_sampling_output_wise.cpp @@ -3,6 +3,7 @@ #include "mlrl/common/data/array.hpp" #include "mlrl/common/data/indexed_value.hpp" #include "mlrl/common/sampling/partition_single.hpp" +#include "mlrl/common/util/math.hpp" #include "stratified_sampling_common.hpp" #include @@ -376,10 +377,11 @@ LabelWiseStratification::LabelWiseStratification(std template void LabelWiseStratification::sampleWeights(BitWeightVector& weightVector, - float32 sampleSize) { + float32 sampleSize, uint32 minSamples, + uint32 maxSamples) { uint32 numRows = stratificationMatrix_.getNumRows(); uint32 numCols = stratificationMatrix_.getNumCols(); - uint32 numTotalSamples = static_cast(std::round(sampleSize * numRows)); + uint32 numTotalSamples = util::calculateBoundedFraction(numRows, sampleSize, minSamples, maxSamples); uint32 numTotalOutOfSamples = numRows - numTotalSamples; uint32 numNonZeroWeights = 0; uint32 numZeroWeights = 0; From a1778979d1daf35f911fdd0c879730dd1ebd96bd Mon Sep 17 00:00:00 2001 From: Michael Rapp Date: Wed, 2 Oct 2024 00:01:23 +0200 Subject: [PATCH 05/12] Add functions "getMinSampleSize", "setMinSampleSize", "getMaxSampleSize" and "setMaxSampleSize" to class IOutputWiseStratifiedInstanceSamplingConfig. 
--- ...stance_sampling_stratified_output_wise.hpp | 45 ++++++++++++++ ...stance_sampling_stratified_output_wise.cpp | 62 +++++++++++++++---- .../mlrl/common/cython/instance_sampling.pxd | 8 +++ .../mlrl/common/cython/instance_sampling.pyx | 42 +++++++++++++ 4 files changed, 146 insertions(+), 11 deletions(-) diff --git a/cpp/subprojects/common/include/mlrl/common/sampling/instance_sampling_stratified_output_wise.hpp b/cpp/subprojects/common/include/mlrl/common/sampling/instance_sampling_stratified_output_wise.hpp index 7b0a778f0e..8a1c798e84 100644 --- a/cpp/subprojects/common/include/mlrl/common/sampling/instance_sampling_stratified_output_wise.hpp +++ b/cpp/subprojects/common/include/mlrl/common/sampling/instance_sampling_stratified_output_wise.hpp @@ -35,6 +35,39 @@ class MLRLCOMMON_API IOutputWiseStratifiedInstanceSamplingConfig { * allows further configuration of the method for sampling instances */ virtual IOutputWiseStratifiedInstanceSamplingConfig& setSampleSize(float32 sampleSize) = 0; + + /** + * Returns the minimum number of examples that are included in a sample. + * + * @return The minimum number of examples that are included in a sample + */ + virtual uint32 getMinSamples() const = 0; + + /** + * Sets the minimum number of examples that should be included in a sample. + * + * @param minSamples The minimum number of examples that should be included in a sample. Must be at least 1 + * @return A reference to an object of type `IOutputWiseStratifiedInstanceSamplingConfig` that + * allows further configuration of the method for sampling instances + */ + virtual IOutputWiseStratifiedInstanceSamplingConfig& setMinSamples(uint32 minSamples) = 0; + + /** + * Returns the maximum number of examples that are included in a sample. + * + * @return The maximum number of examples that are included in a sample + */ + virtual uint32 getMaxSamples() const = 0; + + /** + * Sets the maximum number of examples that should be included in a sample. 
+ * + * @param maxSamples The maximum number of examples that should be included in a sample. Must be at least the value + * returned by `getMinSamples` or 0, if the number of examples should not be restricted + * @return A reference to an object of type `IOutputWiseStratifiedInstanceSamplingConfig` that + * allows further configuration of the method for sampling instances + */ + virtual IOutputWiseStratifiedInstanceSamplingConfig& setMaxSamples(uint32 maxSamples) = 0; }; /** @@ -49,6 +82,10 @@ class OutputWiseStratifiedInstanceSamplingConfig final : public IClassificationI float32 sampleSize_; + uint32 minSamples_; + + uint32 maxSamples_; + public: /** @@ -61,6 +98,14 @@ class OutputWiseStratifiedInstanceSamplingConfig final : public IClassificationI IOutputWiseStratifiedInstanceSamplingConfig& setSampleSize(float32 sampleSize) override; + uint32 getMinSamples() const override; + + IOutputWiseStratifiedInstanceSamplingConfig& setMinSamples(uint32 minSamples) override; + + uint32 getMaxSamples() const override; + + IOutputWiseStratifiedInstanceSamplingConfig& setMaxSamples(uint32 maxSamples) override; + std::unique_ptr createClassificationInstanceSamplingFactory() const override; }; diff --git a/cpp/subprojects/common/src/mlrl/common/sampling/instance_sampling_stratified_output_wise.cpp b/cpp/subprojects/common/src/mlrl/common/sampling/instance_sampling_stratified_output_wise.cpp index 4a675374ca..8ae84de224 100644 --- a/cpp/subprojects/common/src/mlrl/common/sampling/instance_sampling_stratified_output_wise.cpp +++ b/cpp/subprojects/common/src/mlrl/common/sampling/instance_sampling_stratified_output_wise.cpp @@ -20,6 +20,10 @@ class OutputWiseStratifiedSampling final : public IInstanceSampling { const float32 sampleSize_; + const uint32 minSamples_; + + const uint32 maxSamples_; + BitWeightVector weightVector_; LabelWiseStratification stratification_; @@ -39,13 +43,14 @@ class OutputWiseStratifiedSampling final : public IInstanceSampling { * corresponds to 60 % of the
available examples). Must be in (0, 1] */ OutputWiseStratifiedSampling(std::unique_ptr rngPtr, const LabelMatrix& labelMatrix, - IndexIterator indicesBegin, IndexIterator indicesEnd, float32 sampleSize) - : sampleSize_(sampleSize), + IndexIterator indicesBegin, IndexIterator indicesEnd, float32 sampleSize, + uint32 minSamples, uint32 maxSamples) + : sampleSize_(sampleSize), minSamples_(minSamples), maxSamples_(maxSamples), weightVector_(labelMatrix.numRows, static_cast(indicesEnd - indicesBegin) < labelMatrix.numRows), stratification_(std::move(rngPtr), labelMatrix, indicesBegin, indicesEnd) {} const IWeightVector& sample() override { - stratification_.sampleWeights(weightVector_, sampleSize_); + stratification_.sampleWeights(weightVector_, sampleSize_, minSamples_, maxSamples_); return weightVector_; } }; @@ -62,6 +67,10 @@ class OutputWiseStratifiedInstanceSamplingFactory final : public IClassification const float32 sampleSize_; + const uint32 minSamples_; + + const uint32 maxSamples_; + public: /** @@ -69,41 +78,50 @@ class OutputWiseStratifiedInstanceSamplingFactory final : public IClassification * generators * @param sampleSize The fraction of examples to be included in the sample (e.g. a value of 0.6 corresponds * to 60 % of the available examples). Must be in (0, 1] + * @param minSamples The minimum number of examples to be included in the sample. Must be at least 1 + * @param maxSamples The maximum number of examples to be included in the sample. 
Must be at least + * `minSamples` or 0, if the number of examples should not be restricted */ - OutputWiseStratifiedInstanceSamplingFactory(std::unique_ptr rngFactoryPtr, float32 sampleSize) - : rngFactoryPtr_(std::move(rngFactoryPtr)), sampleSize_(sampleSize) {} + OutputWiseStratifiedInstanceSamplingFactory(std::unique_ptr rngFactoryPtr, float32 sampleSize, + uint32 minSamples, uint32 maxSamples) + : rngFactoryPtr_(std::move(rngFactoryPtr)), sampleSize_(sampleSize), minSamples_(minSamples), + maxSamples_(maxSamples) {} std::unique_ptr create(const CContiguousView& labelMatrix, const SinglePartition& partition, IStatistics& statistics) const override { return std::make_unique< OutputWiseStratifiedSampling, SinglePartition::const_iterator>>( - rngFactoryPtr_->create(), labelMatrix, partition.cbegin(), partition.cend(), sampleSize_); + rngFactoryPtr_->create(), labelMatrix, partition.cbegin(), partition.cend(), sampleSize_, minSamples_, + maxSamples_); } std::unique_ptr create(const CContiguousView& labelMatrix, BiPartition& partition, IStatistics& statistics) const override { return std::make_unique< OutputWiseStratifiedSampling, BiPartition::const_iterator>>( - rngFactoryPtr_->create(), labelMatrix, partition.first_cbegin(), partition.first_cend(), sampleSize_); + rngFactoryPtr_->create(), labelMatrix, partition.first_cbegin(), partition.first_cend(), sampleSize_, + minSamples_, maxSamples_); } std::unique_ptr create(const BinaryCsrView& labelMatrix, const SinglePartition& partition, IStatistics& statistics) const override { return std::make_unique>( - rngFactoryPtr_->create(), labelMatrix, partition.cbegin(), partition.cend(), sampleSize_); + rngFactoryPtr_->create(), labelMatrix, partition.cbegin(), partition.cend(), sampleSize_, minSamples_, + maxSamples_); } std::unique_ptr create(const BinaryCsrView& labelMatrix, BiPartition& partition, IStatistics& statistics) const override { return std::make_unique>( - rngFactoryPtr_->create(), labelMatrix, 
partition.first_cbegin(), partition.first_cend(), sampleSize_); + rngFactoryPtr_->create(), labelMatrix, partition.first_cbegin(), partition.first_cend(), sampleSize_, + minSamples_, maxSamples_); } }; OutputWiseStratifiedInstanceSamplingConfig::OutputWiseStratifiedInstanceSamplingConfig( ReadableProperty rngConfig) - : rngConfig_(rngConfig), sampleSize_(0.66f) {} + : rngConfig_(rngConfig), sampleSize_(0.66f), minSamples_(1), maxSamples_(0) {} float32 OutputWiseStratifiedInstanceSamplingConfig::getSampleSize() const { return sampleSize_; @@ -117,8 +135,30 @@ IOutputWiseStratifiedInstanceSamplingConfig& OutputWiseStratifiedInstanceSamplin return *this; } +uint32 OutputWiseStratifiedInstanceSamplingConfig::getMinSamples() const { + return minSamples_; +} + +IOutputWiseStratifiedInstanceSamplingConfig& OutputWiseStratifiedInstanceSamplingConfig::setMinSamples( + uint32 minSamples) { + util::assertGreaterOrEqual("minSamples", minSamples, 1); + minSamples_ = minSamples; + return *this; +} + +uint32 OutputWiseStratifiedInstanceSamplingConfig::getMaxSamples() const { + return maxSamples_; +} + +IOutputWiseStratifiedInstanceSamplingConfig& OutputWiseStratifiedInstanceSamplingConfig::setMaxSamples( + uint32 maxSamples) { + if (maxSamples != 0) util::assertGreaterOrEqual("maxSamples", maxSamples, minSamples_); + maxSamples_ = maxSamples; + return *this; +} + std::unique_ptr OutputWiseStratifiedInstanceSamplingConfig::createClassificationInstanceSamplingFactory() const { return std::make_unique(rngConfig_.get().createRNGFactory(), - sampleSize_); + sampleSize_, minSamples_, maxSamples_); } diff --git a/python/subprojects/common/mlrl/common/cython/instance_sampling.pxd b/python/subprojects/common/mlrl/common/cython/instance_sampling.pxd index 9921ad1c30..609067efbc 100644 --- a/python/subprojects/common/mlrl/common/cython/instance_sampling.pxd +++ b/python/subprojects/common/mlrl/common/cython/instance_sampling.pxd @@ -22,6 +22,14 @@ cdef extern from
"mlrl/common/sampling/instance_sampling_stratified_output_wise. IOutputWiseStratifiedInstanceSamplingConfig& setSampleSize(float32 sampleSize) + uint32 getMinSamples() const + + IOutputWiseStratifiedInstanceSamplingConfig& setMinSamples(uint32 minSamples) + + uint32 getMaxSamples() const + + IOutputWiseStratifiedInstanceSamplingConfig& setMaxSamples(uint32 maxSamples) + cdef extern from "mlrl/common/sampling/instance_sampling_with_replacement.hpp" nogil: diff --git a/python/subprojects/common/mlrl/common/cython/instance_sampling.pyx b/python/subprojects/common/mlrl/common/cython/instance_sampling.pyx index 33c48006c6..25221a16a1 100644 --- a/python/subprojects/common/mlrl/common/cython/instance_sampling.pyx +++ b/python/subprojects/common/mlrl/common/cython/instance_sampling.pyx @@ -61,6 +61,48 @@ cdef class OutputWiseStratifiedInstanceSamplingConfig: self.config_ptr.setSampleSize(sample_size) return self + def get_min_samples(self) -> int: + """ + Returns the minimum number of examples that are included in a sample. + + :return: The minimum number of examples that are included in a sample + """ + return self.config_ptr.getMinSamples() + + def set_min_samples(self, min_samples: int) -> OutputWiseStratifiedInstanceSamplingConfig: + """ + Sets the minimum number of examples that should be included in a sample. + + :param min_samples: The minimum number of examples that should be included in a sample. Must be at least 1 + :return: An `OutputWiseStratifiedInstanceSamplingConfig` that allows further configuration of the + method for sampling instances + """ + assert_greater_or_equal('min_samples', min_samples, 1) + self.config_ptr.setMinSamples(min_samples) + return self + + def get_max_samples(self) -> int: + """ + Returns the maximum number of examples that are included in a sample.
+ + :return: The maximum number of examples that are included in a sample + """ + return self.config_ptr.getMaxSamples() + + def set_max_samples(self, max_samples: int) -> OutputWiseStratifiedInstanceSamplingConfig: + """ + Sets the maximum number of examples that should be included in a sample. + + :param max_samples: The maximum number of examples that should be included in a sample. Must be at least + `get_min_samples()` or 0, if the number of examples should not be restricted + :return: An `OutputWiseStratifiedInstanceSamplingConfig` that allows further configuration of the + method for sampling instances + """ + if max_samples != 0: + assert_greater_or_equal('max_samples', max_samples, self.get_min_samples()) + self.config_ptr.setMaxSamples(max_samples) + return self + cdef class InstanceSamplingWithReplacementConfig: """ From 85d00b9d62e3326c57bd10b870f9f342e5f57e39 Mon Sep 17 00:00:00 2001 From: Michael Rapp Date: Wed, 2 Oct 2024 00:55:52 +0200 Subject: [PATCH 06/12] Add arguments "minSamples" and "maxSamples" to function "sampleWeights" of the class ExampleWiseStratification. 
--- .../common/sampling/stratified_sampling_example_wise.hpp | 6 +++++- .../sampling/instance_sampling_stratified_example_wise.cpp | 2 +- .../common/sampling/stratified_sampling_example_wise.cpp | 6 ++++-- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/cpp/subprojects/common/include/mlrl/common/sampling/stratified_sampling_example_wise.hpp b/cpp/subprojects/common/include/mlrl/common/sampling/stratified_sampling_example_wise.hpp index dc4c138ab7..1e1d97065b 100644 --- a/cpp/subprojects/common/include/mlrl/common/sampling/stratified_sampling_example_wise.hpp +++ b/cpp/subprojects/common/include/mlrl/common/sampling/stratified_sampling_example_wise.hpp @@ -46,8 +46,12 @@ class ExampleWiseStratification final { * * @param weightVector A reference to an object of type `BitWeightVector`, the weights should be written to * @param sampleSize The fraction of the available examples to be selected + * @param minSamples The minimum number of examples to be included in the sample. Must be at least 1 + * @param maxSamples The maximum number of examples to be included in the sample. Must be at least + * `minSamples` or 0, if the number of examples should not be restricted */ - void sampleWeights(BitWeightVector& weightVector, float32 sampleSize) const; + void sampleWeights(BitWeightVector& weightVector, float32 sampleSize, uint32 minSamples, + uint32 maxSamples) const; /** * Randomly splits the available examples into two distinct sets and updates a given `BiPartition` accordingly. 
diff --git a/cpp/subprojects/common/src/mlrl/common/sampling/instance_sampling_stratified_example_wise.cpp b/cpp/subprojects/common/src/mlrl/common/sampling/instance_sampling_stratified_example_wise.cpp index 0d9cafca97..c146485607 100644 --- a/cpp/subprojects/common/src/mlrl/common/sampling/instance_sampling_stratified_example_wise.cpp +++ b/cpp/subprojects/common/src/mlrl/common/sampling/instance_sampling_stratified_example_wise.cpp @@ -44,7 +44,7 @@ class ExampleWiseStratifiedSampling final : public IInstanceSampling { stratification_(std::move(rngPtr), labelMatrix, indicesBegin, indicesEnd) {} const IWeightVector& sample() override { - stratification_.sampleWeights(weightVector_, sampleSize_); + stratification_.sampleWeights(weightVector_, sampleSize_, 1, 0); return weightVector_; } }; diff --git a/cpp/subprojects/common/src/mlrl/common/sampling/stratified_sampling_example_wise.cpp b/cpp/subprojects/common/src/mlrl/common/sampling/stratified_sampling_example_wise.cpp index 18b78a3ca2..d5fb5d3773 100644 --- a/cpp/subprojects/common/src/mlrl/common/sampling/stratified_sampling_example_wise.cpp +++ b/cpp/subprojects/common/src/mlrl/common/sampling/stratified_sampling_example_wise.cpp @@ -1,6 +1,7 @@ #include "mlrl/common/sampling/stratified_sampling_example_wise.hpp" #include "mlrl/common/sampling/partition_single.hpp" +#include "mlrl/common/util/math.hpp" #include "stratified_sampling_common.hpp" #include @@ -46,8 +47,9 @@ ExampleWiseStratification::ExampleWiseStratification template void ExampleWiseStratification::sampleWeights(BitWeightVector& weightVector, - float32 sampleSize) const { - uint32 numTotalSamples = static_cast(std::round(sampleSize * numTotal_)); + float32 sampleSize, uint32 minSamples, + uint32 maxSamples) const { + uint32 numTotalSamples = util::calculateBoundedFraction(numTotal_, sampleSize, minSamples, maxSamples); uint32 numTotalOutOfSamples = numTotal_ - numTotalSamples; uint32 numNonZeroWeights = 0; uint32 numZeroWeights = 0; From 
a1c9b4eccdb92dfd51747d00de058611fff6b615 Mon Sep 17 00:00:00 2001 From: Michael Rapp Date: Wed, 2 Oct 2024 00:03:57 +0200 Subject: [PATCH 07/12] Add functions "getMinSampleSize", "setMinSampleSize", "getMaxSampleSize" and "setMaxSampleSize" to class IExampleWiseStratifiedInstanceSamplingConfig. --- ...tance_sampling_stratified_example_wise.hpp | 45 +++++++++++++ ...tance_sampling_stratified_example_wise.cpp | 65 +++++++++++++++---- .../mlrl/common/cython/instance_sampling.pxd | 8 +++ .../mlrl/common/cython/instance_sampling.pyx | 42 ++++++++++++ 4 files changed, 149 insertions(+), 11 deletions(-) diff --git a/cpp/subprojects/common/include/mlrl/common/sampling/instance_sampling_stratified_example_wise.hpp b/cpp/subprojects/common/include/mlrl/common/sampling/instance_sampling_stratified_example_wise.hpp index bd714c3d00..081651f65b 100644 --- a/cpp/subprojects/common/include/mlrl/common/sampling/instance_sampling_stratified_example_wise.hpp +++ b/cpp/subprojects/common/include/mlrl/common/sampling/instance_sampling_stratified_example_wise.hpp @@ -34,6 +34,39 @@ class MLRLCOMMON_API IExampleWiseStratifiedInstanceSamplingConfig { * allows further configuration of the method for sampling instances */ virtual IExampleWiseStratifiedInstanceSamplingConfig& setSampleSize(float32 sampleSize) = 0; + + /** + * Returns the minimum number of examples that are included in a sample. + * + * @return The minimum number of examples that are included in a sample + */ + virtual uint32 getMinSamples() const = 0; + + /** + * Sets the minimum number of examples that should be included in a sample. + * + * @param minSamples The minimum number of examples that should be included in a sample. 
Must be at least 1 + * @return A reference to an object of type `IExampleWiseStratifiedInstanceSamplingConfig` that + * allows further configuration of the method for sampling instances + */ + virtual IExampleWiseStratifiedInstanceSamplingConfig& setMinSamples(uint32 minSamples) = 0; + + /** + * Returns the maximum number of examples that are included in a sample. + * + * @return The maximum number of examples that are included in a sample + */ + virtual uint32 getMaxSamples() const = 0; + + /** + * Sets the maximum number of examples that should be included in a sample. + * + * @param maxSamples The maximum number of examples that should be included in a sample. Must be at least the value + * returned by `getMinSamples` or 0, if the number of examples should not be restricted + * @return A reference to an object of type `IExampleWiseStratifiedInstanceSamplingConfig` that + * allows further configuration of the method for sampling instances + */ + virtual IExampleWiseStratifiedInstanceSamplingConfig& setMaxSamples(uint32 maxSamples) = 0; }; /** @@ -48,6 +81,10 @@ class ExampleWiseStratifiedInstanceSamplingConfig final : public IClassification float32 sampleSize_; + uint32 minSamples_; + + uint32 maxSamples_; + public: /** @@ -60,6 +97,14 @@ class ExampleWiseStratifiedInstanceSamplingConfig final : public IClassification IExampleWiseStratifiedInstanceSamplingConfig& setSampleSize(float32 sampleSize) override; + uint32 getMinSamples() const override; + + IExampleWiseStratifiedInstanceSamplingConfig& setMinSamples(uint32 minSamples) override; + + uint32 getMaxSamples() const override; + + IExampleWiseStratifiedInstanceSamplingConfig& setMaxSamples(uint32 maxSamples) override; + std::unique_ptr createClassificationInstanceSamplingFactory() const override; }; diff --git a/cpp/subprojects/common/src/mlrl/common/sampling/instance_sampling_stratified_example_wise.cpp b/cpp/subprojects/common/src/mlrl/common/sampling/instance_sampling_stratified_example_wise.cpp index
c146485607..24ab678930 100644 --- a/cpp/subprojects/common/src/mlrl/common/sampling/instance_sampling_stratified_example_wise.cpp +++ b/cpp/subprojects/common/src/mlrl/common/sampling/instance_sampling_stratified_example_wise.cpp @@ -19,6 +19,10 @@ class ExampleWiseStratifiedSampling final : public IInstanceSampling { const float32 sampleSize_; + const uint32 minSamples_; + + const uint32 maxSamples_; + BitWeightVector weightVector_; const ExampleWiseStratification stratification_; @@ -36,15 +40,19 @@ class ExampleWiseStratifiedSampling final : public IInstanceSampling { * training set * @param sampleSize The fraction of examples to be included in the sample (e.g. a value of 0.6 * corresponds to 60 % of the available examples). Must be in (0, 1] + * @param minSamples The minimum number of examples to be included in the sample. Must be at least 1 + * @param maxSamples The maximum number of examples to be included in the sample. Must be at least + * `minSamples` or 0, if the number of examples should not be restricted */ ExampleWiseStratifiedSampling(std::unique_ptr rngPtr, const LabelMatrix& labelMatrix, - IndexIterator indicesBegin, IndexIterator indicesEnd, float32 sampleSize) - : sampleSize_(sampleSize), + IndexIterator indicesBegin, IndexIterator indicesEnd, float32 sampleSize, + uint32 minSamples, uint32 maxSamples) + : sampleSize_(sampleSize), minSamples_(minSamples), maxSamples_(maxSamples), weightVector_(labelMatrix.numRows, static_cast(indicesEnd - indicesBegin) < labelMatrix.numRows), stratification_(std::move(rngPtr), labelMatrix, indicesBegin, indicesEnd) {} const IWeightVector& sample() override { - stratification_.sampleWeights(weightVector_, sampleSize_, 1, 0); + stratification_.sampleWeights(weightVector_, sampleSize_, minSamples_, maxSamples_); return weightVector_; } }; @@ -60,6 +68,10 @@ class ExampleWiseStratifiedInstanceSamplingFactory final : public IClassificatio const float32 sampleSize_; + const uint32 minSamples_; + + const uint32 
maxSamples_; + public: /** @@ -67,41 +79,50 @@ class ExampleWiseStratifiedInstanceSamplingFactory final : public IClassificatio * generators * @param sampleSize The fraction of examples to be included in the sample (e.g. a value of 0.6 corresponds * to 60 % of the available examples). Must be in (0, 1] + * @param minSamples The minimum number of examples to be included in the sample. Must be at least 1 + * @param maxSamples The maximum number of examples to be included in the sample. Must be at least + * `minSamples` or 0, if the number of examples should not be restricted */ - ExampleWiseStratifiedInstanceSamplingFactory(std::unique_ptr rngFactoryPtr, float32 sampleSize) - : rngFactoryPtr_(std::move(rngFactoryPtr)), sampleSize_(sampleSize) {} + ExampleWiseStratifiedInstanceSamplingFactory(std::unique_ptr rngFactoryPtr, float32 sampleSize, + uint32 minSamples, uint32 maxSamples) + : rngFactoryPtr_(std::move(rngFactoryPtr)), sampleSize_(sampleSize), minSamples_(minSamples), + maxSamples_(maxSamples) {} std::unique_ptr create(const CContiguousView& labelMatrix, const SinglePartition& partition, IStatistics& statistics) const override { return std::make_unique< ExampleWiseStratifiedSampling, SinglePartition::const_iterator>>( - rngFactoryPtr_->create(), labelMatrix, partition.cbegin(), partition.cend(), sampleSize_); + rngFactoryPtr_->create(), labelMatrix, partition.cbegin(), partition.cend(), sampleSize_, minSamples_, + maxSamples_); } std::unique_ptr create(const CContiguousView& labelMatrix, BiPartition& partition, IStatistics& statistics) const override { return std::make_unique< ExampleWiseStratifiedSampling, BiPartition::const_iterator>>( - rngFactoryPtr_->create(), labelMatrix, partition.first_cbegin(), partition.first_cend(), sampleSize_); + rngFactoryPtr_->create(), labelMatrix, partition.first_cbegin(), partition.first_cend(), sampleSize_, + minSamples_, maxSamples_); } std::unique_ptr create(const BinaryCsrView& labelMatrix, const SinglePartition& 
partition, IStatistics& statistics) const override { return std::make_unique>( - rngFactoryPtr_->create(), labelMatrix, partition.cbegin(), partition.cend(), sampleSize_); + rngFactoryPtr_->create(), labelMatrix, partition.cbegin(), partition.cend(), sampleSize_, minSamples_, + maxSamples_); } std::unique_ptr create(const BinaryCsrView& labelMatrix, BiPartition& partition, IStatistics& statistics) const override { return std::make_unique>( - rngFactoryPtr_->create(), labelMatrix, partition.first_cbegin(), partition.first_cend(), sampleSize_); + rngFactoryPtr_->create(), labelMatrix, partition.first_cbegin(), partition.first_cend(), sampleSize_, + minSamples_, maxSamples_); } }; ExampleWiseStratifiedInstanceSamplingConfig::ExampleWiseStratifiedInstanceSamplingConfig( ReadableProperty rngConfig) - : rngConfig_(rngConfig), sampleSize_(0.66f) {} + : rngConfig_(rngConfig), sampleSize_(0.66f), minSamples_(1), maxSamples_(0) {} float32 ExampleWiseStratifiedInstanceSamplingConfig::getSampleSize() const { return sampleSize_; @@ -115,8 +136,30 @@ IExampleWiseStratifiedInstanceSamplingConfig& ExampleWiseStratifiedInstanceSampl return *this; } +uint32 ExampleWiseStratifiedInstanceSamplingConfig::getMinSamples() const { + return minSamples_; +} + +IExampleWiseStratifiedInstanceSamplingConfig& ExampleWiseStratifiedInstanceSamplingConfig::setMinSamples( + uint32 minSamples) { + util::assertGreaterOrEqual("minSamples", minSamples, 1); + minSamples_ = minSamples; + return *this; +} + +uint32 ExampleWiseStratifiedInstanceSamplingConfig::getMaxSamples() const { + return maxSamples_; +} + +IExampleWiseStratifiedInstanceSamplingConfig& ExampleWiseStratifiedInstanceSamplingConfig::setMaxSamples( + uint32 maxSamples) { + if (maxSamples != 0) util::assertGreaterOrEqual("maxSamples", maxSamples, minSamples_); + maxSamples_ = maxSamples; + return *this; +} + std::unique_ptr ExampleWiseStratifiedInstanceSamplingConfig::createClassificationInstanceSamplingFactory() const { return 
std::make_unique(rngConfig_.get().createRNGFactory(), - sampleSize_); + sampleSize_, minSamples_, maxSamples_); } diff --git a/python/subprojects/common/mlrl/common/cython/instance_sampling.pxd b/python/subprojects/common/mlrl/common/cython/instance_sampling.pxd index 609067efbc..bc50e81676 100644 --- a/python/subprojects/common/mlrl/common/cython/instance_sampling.pxd +++ b/python/subprojects/common/mlrl/common/cython/instance_sampling.pxd @@ -11,6 +11,14 @@ cdef extern from "mlrl/common/sampling/instance_sampling_stratified_example_wise IExampleWiseStratifiedInstanceSamplingConfig& setSampleSize(float32 sampleSize) + uint32 getMinSamples() const + + IExampleWiseStratifiedInstanceSamplingConfig& setMinSamples(uint32 minSamples) + + uint32 getMaxSamples() const + + IExampleWiseStratifiedInstanceSamplingConfig& setMaxSamples(uint32 maxSamples) + cdef extern from "mlrl/common/sampling/instance_sampling_stratified_output_wise.hpp" nogil: diff --git a/python/subprojects/common/mlrl/common/cython/instance_sampling.pyx b/python/subprojects/common/mlrl/common/cython/instance_sampling.pyx index 25221a16a1..b7c0ae1b01 100644 --- a/python/subprojects/common/mlrl/common/cython/instance_sampling.pyx +++ b/python/subprojects/common/mlrl/common/cython/instance_sampling.pyx @@ -32,6 +32,48 @@ cdef class ExampleWiseStratifiedInstanceSamplingConfig: self.config_ptr.setSampleSize(sample_size) return self + def get_min_samples(self) -> int: + """ + Returns the minimum number of examples that are included in a sample. + + :return: The minimum number of examples that are included in a sample + """ + return self.config_ptr.getMinSamples() + + def set_min_samples(self, min_samples: int) -> ExampleWiseStratifiedInstanceSamplingConfig: + """ + Sets the minimum number of examples that should be included in a sample. + + :param min_samples: The minimum number of examples that should be included in a sample.
Must be at least 1 + :return: An `ExampleWiseStratifiedInstanceSamplingConfig` that allows further configuration of the + method for sampling instances + """ + assert_greater_or_equal('min_samples', min_samples, 1) + self.config_ptr.setMinSamples(min_samples) + return self + + def get_max_samples(self) -> int: + """ + Returns the maximum number of examples that are included in a sample. + + :return: The maximum number of examples that are included in a sample + """ + return self.config_ptr.getMaxSamples() + + def set_max_samples(self, max_samples: int) -> ExampleWiseStratifiedInstanceSamplingConfig: + """ + Sets the maximum number of examples that should be included in a sample. + + :param max_samples: The maximum number of examples that should be included in a sample. Must be at least + `get_min_samples()` or 0, if the number of examples should not be restricted + :return: An `ExampleWiseStratifiedInstanceSamplingConfig` that allows further configuration of the + method for sampling instances + """ + if max_samples != 0: + assert_greater_or_equal('max_samples', max_samples, self.get_min_samples()) + self.config_ptr.setMaxSamples(max_samples) + return self + cdef class OutputWiseStratifiedInstanceSamplingConfig: """ From a0ed8aca8cc298bf780182c230cb3451b70fdc63 Mon Sep 17 00:00:00 2001 From: Michael Rapp Date: Wed, 2 Oct 2024 15:33:11 +0200 Subject: [PATCH 08/12] Adjust expected output of integration tests. 
--- .../instance-sampling-with-replacement.txt | 32 +++++++++---------- .../instance-sampling-without-replacement.txt | 32 +++++++++---------- .../out/boomer-classifier/pruning-irep.txt | 32 +++++++++---------- .../instance-sampling-with-replacement.txt | 8 ++--- .../instance-sampling-without-replacement.txt | 8 ++--- .../res/out/boomer-regressor/pruning-irep.txt | 8 ++--- .../instance-sampling-with-replacement.txt | 32 +++++++++---------- .../instance-sampling-without-replacement.txt | 28 ++++++++-------- .../res/out/seco-classifier/pruning-irep.txt | 28 ++++++++-------- 9 files changed, 104 insertions(+), 104 deletions(-) diff --git a/python/subprojects/testbed/tests/res/out/boomer-classifier/instance-sampling-with-replacement.txt b/python/subprojects/testbed/tests/res/out/boomer-classifier/instance-sampling-with-replacement.txt index 793f747757..68b7de2ca5 100644 --- a/python/subprojects/testbed/tests/res/out/boomer-classifier/instance-sampling-with-replacement.txt +++ b/python/subprojects/testbed/tests/res/out/boomer-classifier/instance-sampling-with-replacement.txt @@ -12,21 +12,21 @@ DEBUG A dense matrix is used to store the predicted labels INFO Successfully predicted in INFO Evaluation result for test data: -Example-wise F1 60.43 -Example-wise Jaccard 53.15 -Example-wise Precision 78.91 -Example-wise Recall 59.95 -Hamming Accuracy 81.46 -Hamming Loss 18.54 -Macro F1 65.75 -Macro Jaccard 49.93 -Macro Precision 77.16 -Macro Recall 59.52 -Micro F1 67.56 -Micro Jaccard 51.01 -Micro Precision 76.43 -Micro Recall 60.53 -Subset 0/1 Loss 70.41 -Subset Accuracy 29.59 +Example-wise F1 60.71 +Example-wise Jaccard 52.98 +Example-wise Precision 77.3 +Example-wise Recall 61.22 +Hamming Accuracy 80.87 +Hamming Loss 19.13 +Macro F1 64.82 +Macro Jaccard 49.09 +Macro Precision 73.78 +Macro Recall 59.67 +Micro F1 67.06 +Micro Jaccard 50.44 +Micro Precision 74.35 +Micro Recall 61.07 +Subset 0/1 Loss 71.94 +Subset Accuracy 28.06 INFO Successfully finished after diff --git 
a/python/subprojects/testbed/tests/res/out/boomer-classifier/instance-sampling-without-replacement.txt b/python/subprojects/testbed/tests/res/out/boomer-classifier/instance-sampling-without-replacement.txt index 9fef912e38..0e5d9d4b9d 100644 --- a/python/subprojects/testbed/tests/res/out/boomer-classifier/instance-sampling-without-replacement.txt +++ b/python/subprojects/testbed/tests/res/out/boomer-classifier/instance-sampling-without-replacement.txt @@ -12,21 +12,21 @@ DEBUG A dense matrix is used to store the predicted labels INFO Successfully predicted in INFO Evaluation result for test data: -Example-wise F1 63.28 -Example-wise Jaccard 55.4 -Example-wise Precision 79.42 -Example-wise Recall 63.61 -Hamming Accuracy 82.4 -Hamming Loss 17.6 -Macro F1 67.46 -Macro Jaccard 52.03 -Macro Precision 77.09 -Macro Recall 62.01 -Micro F1 69.78 -Micro Jaccard 53.59 -Micro Precision 77.1 -Micro Recall 63.73 -Subset 0/1 Loss 70.41 -Subset Accuracy 29.59 +Example-wise F1 61.82 +Example-wise Jaccard 54.42 +Example-wise Precision 79.59 +Example-wise Recall 61.14 +Hamming Accuracy 81.63 +Hamming Loss 18.37 +Macro F1 66.4 +Macro Jaccard 50.36 +Macro Precision 77.91 +Macro Recall 59.54 +Micro F1 67.66 +Micro Jaccard 51.13 +Micro Precision 77.13 +Micro Recall 60.27 +Subset 0/1 Loss 69.9 +Subset Accuracy 30.1 INFO Successfully finished after diff --git a/python/subprojects/testbed/tests/res/out/boomer-classifier/pruning-irep.txt b/python/subprojects/testbed/tests/res/out/boomer-classifier/pruning-irep.txt index ce04f271ec..2fefd38044 100644 --- a/python/subprojects/testbed/tests/res/out/boomer-classifier/pruning-irep.txt +++ b/python/subprojects/testbed/tests/res/out/boomer-classifier/pruning-irep.txt @@ -12,21 +12,21 @@ DEBUG A dense matrix is used to store the predicted labels INFO Successfully predicted in INFO Evaluation result for test data: -Example-wise F1 61.11 -Example-wise Jaccard 53.19 -Example-wise Precision 78.66 -Example-wise Recall 61.14 -Hamming Accuracy 80.95 
-Hamming Loss 19.05 -Macro F1 65.62 -Macro Jaccard 49.17 -Macro Precision 75.6 -Macro Recall 58.7 -Micro F1 66.77 -Micro Jaccard 50.11 -Micro Precision 75.25 -Micro Recall 60 -Subset 0/1 Loss 71.94 -Subset Accuracy 28.06 +Example-wise F1 60.26 +Example-wise Jaccard 52.21 +Example-wise Precision 79.17 +Example-wise Recall 60.2 +Hamming Accuracy 80.61 +Hamming Loss 19.39 +Macro F1 64.23 +Macro Jaccard 48.09 +Macro Precision 74.63 +Macro Recall 57.38 +Micro F1 65.77 +Micro Jaccard 48.99 +Micro Precision 75.26 +Micro Recall 58.4 +Subset 0/1 Loss 73.98 +Subset Accuracy 26.02 INFO Successfully finished after diff --git a/python/subprojects/testbed/tests/res/out/boomer-regressor/instance-sampling-with-replacement.txt b/python/subprojects/testbed/tests/res/out/boomer-regressor/instance-sampling-with-replacement.txt index f541d2c73e..f5d4673dce 100644 --- a/python/subprojects/testbed/tests/res/out/boomer-regressor/instance-sampling-with-replacement.txt +++ b/python/subprojects/testbed/tests/res/out/boomer-regressor/instance-sampling-with-replacement.txt @@ -12,9 +12,9 @@ DEBUG A dense matrix is used to store the predicted scores INFO Successfully predicted in INFO Evaluation result for test data: -Mean Absolute Error 31.23 -Mean Absolute Percentage Error 1.87 -Mean Squared Error 2088.96 -Median Absolute Error 19.05 +Mean Absolute Error 32.72 +Mean Absolute Percentage Error 2.22 +Mean Squared Error 2226.47 +Median Absolute Error 21.27 INFO Successfully finished after diff --git a/python/subprojects/testbed/tests/res/out/boomer-regressor/instance-sampling-without-replacement.txt b/python/subprojects/testbed/tests/res/out/boomer-regressor/instance-sampling-without-replacement.txt index db363f8074..787c36769c 100644 --- a/python/subprojects/testbed/tests/res/out/boomer-regressor/instance-sampling-without-replacement.txt +++ b/python/subprojects/testbed/tests/res/out/boomer-regressor/instance-sampling-without-replacement.txt @@ -12,9 +12,9 @@ DEBUG A dense matrix is used to 
store the predicted scores INFO Successfully predicted in INFO Evaluation result for test data: -Mean Absolute Error 29.87 -Mean Absolute Percentage Error 1.72 -Mean Squared Error 1919.64 -Median Absolute Error 20.32 +Mean Absolute Error 31.83 +Mean Absolute Percentage Error 2.01 +Mean Squared Error 2195.87 +Median Absolute Error 21.22 INFO Successfully finished after diff --git a/python/subprojects/testbed/tests/res/out/boomer-regressor/pruning-irep.txt b/python/subprojects/testbed/tests/res/out/boomer-regressor/pruning-irep.txt index 2d99d84ddc..f0446ae921 100644 --- a/python/subprojects/testbed/tests/res/out/boomer-regressor/pruning-irep.txt +++ b/python/subprojects/testbed/tests/res/out/boomer-regressor/pruning-irep.txt @@ -12,9 +12,9 @@ DEBUG A dense matrix is used to store the predicted scores INFO Successfully predicted in INFO Evaluation result for test data: -Mean Absolute Error 32.74 -Mean Absolute Percentage Error 1.69 -Mean Squared Error 2245.15 -Median Absolute Error 19.41 +Mean Absolute Error 33.95 +Mean Absolute Percentage Error 1.95 +Mean Squared Error 2435.84 +Median Absolute Error 21.9 INFO Successfully finished after diff --git a/python/subprojects/testbed/tests/res/out/seco-classifier/instance-sampling-with-replacement.txt b/python/subprojects/testbed/tests/res/out/seco-classifier/instance-sampling-with-replacement.txt index 7a14f893a8..7a1f01437c 100644 --- a/python/subprojects/testbed/tests/res/out/seco-classifier/instance-sampling-with-replacement.txt +++ b/python/subprojects/testbed/tests/res/out/seco-classifier/instance-sampling-with-replacement.txt @@ -12,21 +12,21 @@ DEBUG A dense matrix is used to store the predicted labels INFO Successfully predicted in INFO Evaluation result for test data: -Example-wise F1 53.78 -Example-wise Jaccard 43.94 -Example-wise Precision 57.22 -Example-wise Recall 60.54 -Hamming Accuracy 71.34 -Hamming Loss 28.66 -Macro F1 49.31 -Macro Jaccard 35.42 -Macro Precision 67.16 -Macro Recall 57.9 -Micro F1 57.07 
-Micro Jaccard 39.93 -Micro Precision 54.63 -Micro Recall 59.73 -Subset 0/1 Loss 83.16 -Subset Accuracy 16.84 +Example-wise F1 56.48 +Example-wise Jaccard 47.83 +Example-wise Precision 62.63 +Example-wise Recall 61.56 +Hamming Accuracy 73.98 +Hamming Loss 26.02 +Macro F1 52.68 +Macro Jaccard 38.67 +Macro Precision 67.57 +Macro Recall 58.19 +Micro F1 59.74 +Micro Jaccard 42.59 +Micro Precision 58.96 +Micro Recall 60.53 +Subset 0/1 Loss 77.55 +Subset Accuracy 22.45 INFO Successfully finished after diff --git a/python/subprojects/testbed/tests/res/out/seco-classifier/instance-sampling-without-replacement.txt b/python/subprojects/testbed/tests/res/out/seco-classifier/instance-sampling-without-replacement.txt index 8d78631318..fd2079a4a9 100644 --- a/python/subprojects/testbed/tests/res/out/seco-classifier/instance-sampling-without-replacement.txt +++ b/python/subprojects/testbed/tests/res/out/seco-classifier/instance-sampling-without-replacement.txt @@ -12,20 +12,20 @@ DEBUG A dense matrix is used to store the predicted labels INFO Successfully predicted in INFO Evaluation result for test data: -Example-wise F1 52.94 -Example-wise Jaccard 44.28 -Example-wise Precision 63.36 -Example-wise Recall 58.76 -Hamming Accuracy 72.87 -Hamming Loss 27.13 -Macro F1 51.37 -Macro Jaccard 37.24 -Macro Precision 66.09 -Macro Recall 56.01 -Micro F1 57.41 -Micro Jaccard 40.26 -Micro Precision 57.49 -Micro Recall 57.33 +Example-wise F1 54.21 +Example-wise Jaccard 45.35 +Example-wise Precision 62.47 +Example-wise Recall 58.84 +Hamming Accuracy 73.81 +Hamming Loss 26.19 +Macro F1 52.18 +Macro Jaccard 38.1 +Macro Precision 66.87 +Macro Recall 56.51 +Micro F1 58.6 +Micro Jaccard 41.44 +Micro Precision 59.08 +Micro Recall 58.13 Subset 0/1 Loss 80.61 Subset Accuracy 19.39 diff --git a/python/subprojects/testbed/tests/res/out/seco-classifier/pruning-irep.txt b/python/subprojects/testbed/tests/res/out/seco-classifier/pruning-irep.txt index 8d78631318..fd2079a4a9 100644 --- 
a/python/subprojects/testbed/tests/res/out/seco-classifier/pruning-irep.txt +++ b/python/subprojects/testbed/tests/res/out/seco-classifier/pruning-irep.txt @@ -12,20 +12,20 @@ DEBUG A dense matrix is used to store the predicted labels INFO Successfully predicted in INFO Evaluation result for test data: -Example-wise F1 52.94 -Example-wise Jaccard 44.28 -Example-wise Precision 63.36 -Example-wise Recall 58.76 -Hamming Accuracy 72.87 -Hamming Loss 27.13 -Macro F1 51.37 -Macro Jaccard 37.24 -Macro Precision 66.09 -Macro Recall 56.01 -Micro F1 57.41 -Micro Jaccard 40.26 -Micro Precision 57.49 -Micro Recall 57.33 +Example-wise F1 54.21 +Example-wise Jaccard 45.35 +Example-wise Precision 62.47 +Example-wise Recall 58.84 +Hamming Accuracy 73.81 +Hamming Loss 26.19 +Macro F1 52.18 +Macro Jaccard 38.1 +Macro Precision 66.87 +Macro Recall 56.51 +Micro F1 58.6 +Micro Jaccard 41.44 +Micro Precision 59.08 +Micro Recall 58.13 Subset 0/1 Loss 80.61 Subset Accuracy 19.39 From 5d1b5801aa2e2e070dabdc9d9a5e4dd34c6c2684 Mon Sep 17 00:00:00 2001 From: Michael Rapp Date: Wed, 2 Oct 2024 01:13:11 +0200 Subject: [PATCH 09/12] Add options "min_samples" and "max_samples" for configuring instance sampling methods. 
--- .../subprojects/common/mlrl/common/config.py | 20 +++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/python/subprojects/common/mlrl/common/config.py b/python/subprojects/common/mlrl/common/config.py index af9581d862..8c71cbfd9d 100644 --- a/python/subprojects/common/mlrl/common/config.py +++ b/python/subprojects/common/mlrl/common/config.py @@ -42,6 +42,10 @@ OPTION_SAMPLE_SIZE = 'sample_size' +OPTION_MIN_SAMPLES = 'min_samples' + +OPTION_MAX_SAMPLES = 'max_samples' + OPTION_NUM_SAMPLES = 'num_samples' BINNING_EQUAL_FREQUENCY = 'equal-frequency' @@ -418,16 +422,16 @@ def __init__(self): self.add_value(name=NONE, mixin=NoInstanceSamplingMixin) self.add_value(name=SAMPLING_WITH_REPLACEMENT, mixin=InstanceSamplingWithReplacementMixin, - options={OPTION_SAMPLE_SIZE}) + options={OPTION_SAMPLE_SIZE, OPTION_MIN_SAMPLES, OPTION_MAX_SAMPLES}) self.add_value(name=SAMPLING_WITHOUT_REPLACEMENT, mixin=InstanceSamplingWithoutReplacementMixin, - options={OPTION_SAMPLE_SIZE}) + options={OPTION_SAMPLE_SIZE, OPTION_MIN_SAMPLES, OPTION_MAX_SAMPLES}) self.add_value(name=SAMPLING_STRATIFIED_OUTPUT_WISE, mixin=OutputWiseStratifiedInstanceSamplingMixin, - options={OPTION_SAMPLE_SIZE}) + options={OPTION_SAMPLE_SIZE, OPTION_MIN_SAMPLES, OPTION_MAX_SAMPLES}) self.add_value(name=SAMPLING_STRATIFIED_EXAMPLE_WISE, mixin=ExampleWiseStratifiedInstanceSamplingMixin, - options={OPTION_SAMPLE_SIZE}) + options={OPTION_SAMPLE_SIZE, OPTION_MIN_SAMPLES, OPTION_MAX_SAMPLES}) def _configure(self, config, value: str, options: Optional[Options]): if value == NONE: @@ -435,15 +439,23 @@ def _configure(self, config, value: str, options: Optional[Options]): elif value == SAMPLING_WITH_REPLACEMENT: conf = config.use_instance_sampling_with_replacement() conf.set_sample_size(options.get_float(OPTION_SAMPLE_SIZE, conf.get_sample_size())) + conf.set_min_samples(options.get_int(OPTION_MIN_SAMPLES, conf.get_min_samples())) + conf.set_max_samples(options.get_int(OPTION_MAX_SAMPLES, 
conf.get_max_samples())) elif value == SAMPLING_WITHOUT_REPLACEMENT: conf = config.use_instance_sampling_without_replacement() conf.set_sample_size(options.get_float(OPTION_SAMPLE_SIZE, conf.get_sample_size())) + conf.set_min_samples(options.get_int(OPTION_MIN_SAMPLES, conf.get_min_samples())) + conf.set_max_samples(options.get_int(OPTION_MAX_SAMPLES, conf.get_max_samples())) elif value == SAMPLING_STRATIFIED_OUTPUT_WISE: conf = config.use_output_wise_stratified_instance_sampling() conf.set_sample_size(options.get_float(OPTION_SAMPLE_SIZE, conf.get_sample_size())) + conf.set_min_samples(options.get_int(OPTION_MIN_SAMPLES, conf.get_min_samples())) + conf.set_max_samples(options.get_int(OPTION_MAX_SAMPLES, conf.get_max_samples())) elif value == SAMPLING_STRATIFIED_EXAMPLE_WISE: conf = config.use_example_wise_stratified_instance_sampling() conf.set_sample_size(options.get_float(OPTION_SAMPLE_SIZE, conf.get_sample_size())) + conf.set_min_samples(options.get_int(OPTION_MIN_SAMPLES, conf.get_min_samples())) + conf.set_max_samples(options.get_int(OPTION_MAX_SAMPLES, conf.get_max_samples())) class FeatureSamplingParameter(NominalParameter): From 4cb654fb10112e34d60a2e10839e9c4c7544840a Mon Sep 17 00:00:00 2001 From: Michael Rapp Date: Wed, 2 Oct 2024 01:14:12 +0200 Subject: [PATCH 10/12] Add options "min_samples" and "max_samples" for configuring feature sampling methods. 
--- python/subprojects/common/mlrl/common/config.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/subprojects/common/mlrl/common/config.py b/python/subprojects/common/mlrl/common/config.py index 8c71cbfd9d..9f47f84385 100644 --- a/python/subprojects/common/mlrl/common/config.py +++ b/python/subprojects/common/mlrl/common/config.py @@ -471,7 +471,7 @@ def __init__(self): self.add_value(name=NONE, mixin=NoFeatureSamplingMixin) self.add_value(name=SAMPLING_WITHOUT_REPLACEMENT, mixin=FeatureSamplingWithoutReplacementMixin, - options={OPTION_SAMPLE_SIZE, self.OPTION_NUM_RETAINED}) + options={OPTION_SAMPLE_SIZE, OPTION_MIN_SAMPLES, OPTION_MAX_SAMPLES, self.OPTION_NUM_RETAINED}) def _configure(self, config, value: str, options: Optional[Options]): if value == NONE: @@ -479,6 +479,8 @@ def _configure(self, config, value: str, options: Optional[Options]): elif value == SAMPLING_WITHOUT_REPLACEMENT: conf = config.use_feature_sampling_without_replacement() conf.set_sample_size(options.get_float(OPTION_SAMPLE_SIZE, conf.get_sample_size())) + conf.set_min_samples(options.get_int(OPTION_MIN_SAMPLES, conf.get_min_samples())) + conf.set_max_samples(options.get_int(OPTION_MAX_SAMPLES, conf.get_max_samples())) conf.set_num_retained(options.get_int(self.OPTION_NUM_RETAINED, conf.get_num_retained())) From 37fd3f261f21308fc08bed93c0abf0421d91afcd Mon Sep 17 00:00:00 2001 From: Michael Rapp Date: Tue, 1 Oct 2024 23:40:44 +0200 Subject: [PATCH 11/12] Update documentation. --- doc/user_guide/boosting/parameters.md | 40 +++++++++++++++++++++++++++ doc/user_guide/seco/parameters.md | 40 +++++++++++++++++++++++++++ 2 files changed, 80 insertions(+) diff --git a/doc/user_guide/boosting/parameters.md b/doc/user_guide/boosting/parameters.md index b626be2c61..c1ccbcc8b4 100644 --- a/doc/user_guide/boosting/parameters.md +++ b/doc/user_guide/boosting/parameters.md @@ -427,6 +427,14 @@ The seed to be used by random number generators. 
The given value must be at leas The percentage of features to be included in a sample. For example, a value of 0.6 corresponds to 60% of the features. The given value must be in (0, 1\] or 0, if the sample size should be calculated as log2(A - 1) + 1), where A denotes the number of available features. + - `min_samples` *(Default value = `1`)* + + The minimum number of features to be included in a sample. The given value must be at least 1. + + - `max_samples` *(Default value = `0`)* + + The maximum number of features to be included in a sample. The given value must be at least the value of `min_samples` or 0, if the number of features should not be restricted. + - `num_retained` *(Default value = `0`)* The number of trailing features to be always included in a sample. For example, a value of 2 means that the last two features are always retained. @@ -449,6 +457,14 @@ The seed to be used by random number generators. The given value must be at leas The percentage of examples to be included in a sample. For example, a value of 0.6 corresponds to 60% of the available examples. The given value must be in the range (0, 1). + - `min_samples` *(Default value = `1`)* + + The minimum number of examples to be included in a sample. The given value must be at least 1. + + - `max_samples` *(Default value = `0`)* + + The maximum number of examples to be included in a sample. The given value must be at least the value of `min_samples` or 0, if the number of examples should not be restricted. + `'without-replacement'` The training examples to be considered for learning a new rule are selected randomly without replacement. The following options may be provided using the {ref}`bracket notation`: @@ -456,6 +472,14 @@ The seed to be used by random number generators. The given value must be at leas The percentage of examples to be included in a sample. For example, a value of 0.6 corresponds to 60% of the available examples. The given value must be in the range (0, 1). 
+ - `min_samples` *(Default value = `1`)* + + The minimum number of examples to be included in a sample. The given value must be at least 1. + + - `max_samples` *(Default value = `0`)* + + The maximum number of examples to be included in a sample. The given value must be at least the value of `min_samples` or 0, if the number of examples should not be restricted. + `'stratified-output-wise'` *(classification only)* The training examples to be considered for learning a new rule are selected according to an iterative stratified sampling method that ensures that for each label the proportion of relevant and irrelevant examples is maintained. The following options may be provided using the {ref}`bracket notation`: @@ -463,12 +487,28 @@ The seed to be used by random number generators. The given value must be at leas The percentage of examples to be included in a sample. For example, a value of 0.6 corresponds to 60% of the available examples. The given value must be in the range (0, 1). + - `min_samples` *(Default value = `1`)* + + The minimum number of examples to be included in a sample. The given value must be at least 1. + + - `max_samples` *(Default value = `0`)* + + The maximum number of examples to be included in a sample. The given value must be at least the value of `min_samples` or 0, if the number of examples should not be restricted. + `'stratified-example-wise'` (*classification only*) The training examples to be considered for learning a new rule are selected according to stratified sampling method, where distinct label vectors are treated as individual classes. The following options may be provided using the {ref}`bracket notation`: - `sample_size` *(Default value = `0.66`)* The percentage of examples to be included in a sample. For example, a value of 0.6 corresponds to 60% of the available examples. The given value must be in the range (0, 1). + + - `min_samples` *(Default value = `1`)* + + The minimum number of examples to be included in a sample. 
The given value must be at least 1. + + - `max_samples` *(Default value = `0`)* + + The maximum number of examples to be included in a sample. The given value must be at least the value of `min_samples` or 0, if the number of examples should not be restricted. ``` ## Approximations and Optimizations diff --git a/doc/user_guide/seco/parameters.md b/doc/user_guide/seco/parameters.md index 7f2cd1aea8..5791f21b3e 100644 --- a/doc/user_guide/seco/parameters.md +++ b/doc/user_guide/seco/parameters.md @@ -357,6 +357,14 @@ The seed to be used by random number generators. The given value must be at leas The percentage of features to be included in a sample. For example, a value of 0.6 corresponds to 60% of the features. The given value must be in (0, 1\] or 0, if the sample size should be calculated as log2(A - 1) + 1), where A denotes the number of available features. + - `min_samples` *(Default value = `1`)* + + The minimum number of features to be included in a sample. The given value must be at least 1. + + - `max_samples` *(Default value = `0`)* + + The maximum number of features to be included in a sample. The given value must be at least the value of `min_samples` or 0, if the number of features should not be restricted. + - `num_retained` *(Default value = `0`)* The number of trailing features to be always included in a sample. For example, a value of 2 means that the last two features are always retained. @@ -379,6 +387,14 @@ The seed to be used by random number generators. The given value must be at leas The percentage of examples to be included in a sample. For example, a value of 0.6 corresponds to 60% of the available examples. The given value must be in the range (0, 1). + - `min_samples` *(Default value = `1`)* + + The minimum number of examples to be included in a sample. The given value must be at least 1. + + - `max_samples` *(Default value = `0`)* + + The maximum number of examples to be included in a sample. 
The given value must be at least the value of `min_samples` or 0, if the number of examples should not be restricted. + `'without-replacement'` The training examples to be considered for learning a new rule are selected randomly without replacement. The following options may be provided using the {ref}`bracket notation`: @@ -386,6 +402,14 @@ The seed to be used by random number generators. The given value must be at leas The percentage of examples to be included in a sample. For example, a value of 0.6 corresponds to 60% of the available examples. The given value must be in the range (0, 1). + - `min_samples` *(Default value = `1`)* + + The minimum number of examples to be included in a sample. The given value must be at least 1. + + - `max_samples` *(Default value = `0`)* + + The maximum number of examples to be included in a sample. The given value must be at least the value of `min_samples` or 0, if the number of examples should not be restricted. + `'stratified-output-wise'` The training examples to be considered for learning a new rule are selected according to an iterative stratified sampling method that ensures that for each label the proportion of relevant and irrelevant examples is maintained. The following options may be provided using the {ref}`bracket notation`: @@ -393,12 +417,28 @@ The seed to be used by random number generators. The given value must be at leas The percentage of examples to be included in a sample. For example, a value of 0.6 corresponds to 60% of the available examples. The given value must be in the range (0, 1). + - `min_samples` *(Default value = `1`)* + + The minimum number of examples to be included in a sample. The given value must be at least 1. + + - `max_samples` *(Default value = `0`)* + + The maximum number of examples to be included in a sample. The given value must be at least the value of `min_samples` or 0, if the number of examples should not be restricted. 
+ `'stratified-example-wise'` The training examples to be considered for learning a new rule are selected according to stratified sampling method, where distinct label vectors are treated as individual classes. The following options may be provided using the {ref}`bracket notation`: - `sample_size` *(Default value = `0.66`)* The percentage of examples to be included in a sample. For example, a value of 0.6 corresponds to 60% of the available examples. The given value must be in the range (0, 1). + + - `min_samples` *(Default value = `1`)* + + The minimum number of examples to be included in a sample. The given value must be at least 1. + + - `max_samples` *(Default value = `0`)* + + The maximum number of examples to be included in a sample. The given value must be at least the value of `min_samples` or 0, if the number of examples should not be restricted. ``` ## Approximations and Optimizations From 38d13987ec3a3ae43cf4fe9621d863bb48823a39 Mon Sep 17 00:00:00 2001 From: Michael Rapp Date: Wed, 2 Oct 2024 01:22:12 +0200 Subject: [PATCH 12/12] Update changelog. --- CHANGELOG.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 661b584349..dcf89b9ae1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,14 @@ tocdepth: 2 A major update to the BOOMER algorithm that comes with the following changes. +```{warning} +This release comes with several API changes. For an updated overview of the available parameters and command line arguments, please refer to the [documentation](https://mlrl-boomer.readthedocs.io/en/0.11.0/). +``` + +### API Changes + +- The options `min_samples` and `max_samples` have been added to the values of the command line arguments `--feature-sampling` and `--instance-sampling`. + ### Quality-of-Life Improvements - C++ 20 is now required for compiling the project.