Skip to content

Commit

Permalink
Merge pull request #1088 from mrapp-ke/output-sampling-options
Browse files Browse the repository at this point in the history
Unify options for configuring sampling methods
  • Loading branch information
michael-rapp authored Oct 2, 2024
2 parents 07d5329 + e6418cf commit 75d2c91
Show file tree
Hide file tree
Showing 8 changed files with 179 additions and 40 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ This release comes with several API changes. For an updated overview of the avai
### API Changes

- The options `min_samples` and `max_samples` have been added to the values of the command line arguments `--feature-sampling` and `--instance-sampling`.
- Similar to other sampling methods, the options `sample_size`, `min_samples`, and `max_samples` can now be specified via the command line argument `--output-sampling` when set to the value `without-replacement`.

### Quality-of-Life Improvements

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,20 +17,54 @@ class MLRLCOMMON_API IOutputSamplingWithoutReplacementConfig {
virtual ~IOutputSamplingWithoutReplacementConfig() {}

/**
* Returns the number of outputs that are included in a sample.
* Returns the fraction of outputs that are included in a sample.
*
* @return The number of outputs that are included in a sample
* @return The fraction of outputs that are included in a sample
*/
virtual uint32 getNumSamples() const = 0;
virtual float32 getSampleSize() const = 0;

/**
* Sets the number of outputs that should be included in a sample.
* Sets the fraction of outputs that should be included in a sample.
*
* @param numSamples The number of outputs that should be included in a sample. Must be at least 1
* @param sampleSize The fraction of outputs that should be included in a sample, e.g., a value of 0.6
* corresponds to 60 % of the available outputs. Must be in (0, 1)
* @return A reference to an object of type `IOutputSamplingWithoutReplacementConfig` that allows
* further configuration of the sampling method
* further configuration of the method for sampling outputs
*/
virtual IOutputSamplingWithoutReplacementConfig& setNumSamples(uint32 numSamples) = 0;
virtual IOutputSamplingWithoutReplacementConfig& setSampleSize(float32 sampleSize) = 0;

/**
* Returns the minimum number of outputs that are included in a sample.
*
* @return The minimum number of outputs that are included in a sample
*/
virtual uint32 getMinSamples() const = 0;

/**
* Sets the minimum number of outputs that should be included in a sample.
*
* @param minSamples The minimum number of outputs that should be included in a sample. Must be at least 1
* @return A reference to an object of type `IOutputSamplingWithoutReplacementConfig` that allows
* further configuration of the method for sampling outputs
*/
virtual IOutputSamplingWithoutReplacementConfig& setMinSamples(uint32 minSamples) = 0;

/**
* Returns the maximum number of outputs that are included in a sample.
*
* @return The maximum number of outputs that are included in a sample
*/
virtual uint32 getMaxSamples() const = 0;

/**
* Sets the maximum number of outputs that should be included in a sample.
*
* @param maxSamples The maximum number of outputs that should be included in a sample. Must be at least the value
* returned by `getMinSamples` or 0, if the number of outputs should not be restricted
* @return A reference to an object of type `IOutputSamplingWithoutReplacementConfig` that allows
* further configuration of the method for sampling outputs
*/
virtual IOutputSamplingWithoutReplacementConfig& setMaxSamples(uint32 maxSamples) = 0;
};

/**
Expand All @@ -42,7 +76,11 @@ class OutputSamplingWithoutReplacementConfig final : public IOutputSamplingConfi

const ReadableProperty<RNGConfig> rngConfig_;

uint32 numSamples_;
float32 sampleSize_;

uint32 minSamples_;

uint32 maxSamples_;

public:

Expand All @@ -52,9 +90,17 @@ class OutputSamplingWithoutReplacementConfig final : public IOutputSamplingConfi
*/
OutputSamplingWithoutReplacementConfig(ReadableProperty<RNGConfig> rngConfig);

uint32 getNumSamples() const override;
float32 getSampleSize() const override;

IOutputSamplingWithoutReplacementConfig& setSampleSize(float32 sampleSize) override;

uint32 getMinSamples() const override;

IOutputSamplingWithoutReplacementConfig& setMinSamples(uint32 minSamples) override;

uint32 getMaxSamples() const override;

IOutputSamplingWithoutReplacementConfig& setNumSamples(uint32 numSamples) override;
IOutputSamplingWithoutReplacementConfig& setMaxSamples(uint32 maxSamples) override;

std::unique_ptr<IOutputSamplingFactory> createOutputSamplingFactory(
const IOutputMatrix& outputMatrix) const override;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#include "index_sampling.hpp"
#include "mlrl/common/indices/index_vector_partial.hpp"
#include "mlrl/common/iterator/iterator_index.hpp"
#include "mlrl/common/util/math.hpp"
#include "mlrl/common/util/validation.hpp"

/**
Expand Down Expand Up @@ -67,20 +68,43 @@ class OutputSamplingWithoutReplacementFactory final : public IOutputSamplingFact
};

OutputSamplingWithoutReplacementConfig::OutputSamplingWithoutReplacementConfig(ReadableProperty<RNGConfig> rngConfig)
: rngConfig_(rngConfig), numSamples_(1) {}
: rngConfig_(rngConfig), sampleSize_(0.33f), minSamples_(1), maxSamples_(1) {}

uint32 OutputSamplingWithoutReplacementConfig::getNumSamples() const {
return numSamples_;
float32 OutputSamplingWithoutReplacementConfig::getSampleSize() const {
return sampleSize_;
}

IOutputSamplingWithoutReplacementConfig& OutputSamplingWithoutReplacementConfig::setNumSamples(uint32 numSamples) {
util::assertGreaterOrEqual<uint32>("numSamples", numSamples, 1);
numSamples_ = numSamples;
IOutputSamplingWithoutReplacementConfig& OutputSamplingWithoutReplacementConfig::setSampleSize(float32 sampleSize) {
util::assertGreater<float32>("sampleSize", sampleSize, 0);
util::assertLess<float32>("sampleSize", sampleSize, 1);
sampleSize_ = sampleSize;
return *this;
}

/**
 * Provides the currently configured lower bound on the number of outputs in a sample.
 *
 * @return The value stored in `minSamples_`
 */
uint32 OutputSamplingWithoutReplacementConfig::getMinSamples() const {
    return this->minSamples_;
}

/**
 * Stores the lower bound on the number of outputs in a sample.
 *
 * @param minSamples The minimum number of outputs that should be included in a sample. Checked via
 *                   `util::assertGreaterOrEqual` to be at least 1 before it is stored
 * @return           A reference to this configuration object, which allows chaining further setter calls
 */
IOutputSamplingWithoutReplacementConfig& OutputSamplingWithoutReplacementConfig::setMinSamples(uint32 minSamples) {
    const uint32 smallestAllowedValue = 1;
    util::assertGreaterOrEqual<uint32>("minSamples", minSamples, smallestAllowedValue);
    this->minSamples_ = minSamples;
    return *this;
}

/**
 * Provides the currently configured upper bound on the number of outputs in a sample.
 *
 * @return The value stored in `maxSamples_`
 */
uint32 OutputSamplingWithoutReplacementConfig::getMaxSamples() const {
    return this->maxSamples_;
}

/**
 * Stores the upper bound on the number of outputs in a sample.
 *
 * @param maxSamples The maximum number of outputs that should be included in a sample. A value of 0 is stored
 *                   without validation; any other value is checked via `util::assertGreaterOrEqual` against the
 *                   currently configured `minSamples_`
 * @return           A reference to this configuration object, which allows chaining further setter calls
 */
IOutputSamplingWithoutReplacementConfig& OutputSamplingWithoutReplacementConfig::setMaxSamples(uint32 maxSamples) {
    // Only non-zero values are compared against the configured minimum
    const bool requiresValidation = maxSamples != 0;

    if (requiresValidation) {
        util::assertGreaterOrEqual<uint32>("maxSamples", maxSamples, minSamples_);
    }

    this->maxSamples_ = maxSamples;
    return *this;
}

std::unique_ptr<IOutputSamplingFactory> OutputSamplingWithoutReplacementConfig::createOutputSamplingFactory(
const IOutputMatrix& outputMatrix) const {
uint32 numOutputs = outputMatrix.getNumOutputs();
uint32 numSamples = util::calculateBoundedFraction(numOutputs, sampleSize_, minSamples_, maxSamples_);
return std::make_unique<OutputSamplingWithoutReplacementFactory>(rngConfig_.get().createRNGFactory(),
outputMatrix.getNumOutputs(), numSamples_);
outputMatrix.getNumOutputs(), numSamples);
}
12 changes: 10 additions & 2 deletions doc/user_guide/boosting/parameters.md
Original file line number Diff line number Diff line change
Expand Up @@ -405,9 +405,17 @@ The seed to be used by random number generators. The given value must be at leas
`'without-replacement'`
The outputs to be considered when learning a new rule are chosen randomly. The following options may be provided using the {ref}`bracket notation<bracket-notation>`:
- `num_samples` *(Default value = `1`)*
- `sample_size` *(Default value = `0.33`)*
The number of outputs to be included in a sample. The given value must be at least 1.
The percentage of outputs to be included in a sample. For example, a value of 0.6 corresponds to 60% of the outputs. The given value must be in (0, 1).
- `min_samples` *(Default value = `1`)*
The minimum number of outputs to be included in a sample. The given value must be at least 1.
- `max_samples` *(Default value = `1`)*
The maximum number of outputs to be included in a sample. The given value must be at least the value of `min_samples` or 0, if the number of outputs should not be restricted.
```

(boosting_parameters_feature_sampling)=
Expand Down
12 changes: 10 additions & 2 deletions doc/user_guide/seco/parameters.md
Original file line number Diff line number Diff line change
Expand Up @@ -335,9 +335,17 @@ The seed to be used by random number generators. The given value must be at leas
`'without-replacement'`
The outputs to be considered when learning a new rule are chosen randomly. The following options may be provided using the {ref}`bracket notation<bracket-notation>`:
- `num_samples` *(Default value = `1`)*
- `sample_size` *(Default value = `0.33`)*
The number of outputs to be included in a sample. The given value must be at least 1.
The percentage of outputs to be included in a sample. For example, a value of 0.6 corresponds to 60% of the outputs. The given value must be in (0, 1).
- `min_samples` *(Default value = `1`)*
The minimum number of outputs to be included in a sample. The given value must be at least 1.
- `max_samples` *(Default value = `1`)*
The maximum number of outputs to be included in a sample. The given value must be at least the value of `min_samples` or 0, if the number of outputs should not be restricted.
```

(seco_parameters_feature_sampling)=
Expand Down
8 changes: 4 additions & 4 deletions python/subprojects/common/mlrl/common/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,6 @@

OPTION_MAX_SAMPLES = 'max_samples'

OPTION_NUM_SAMPLES = 'num_samples'

BINNING_EQUAL_FREQUENCY = 'equal-frequency'

BINNING_EQUAL_WIDTH = 'equal-width'
Expand Down Expand Up @@ -398,15 +396,17 @@ def __init__(self):
self.add_value(name=NONE, mixin=NoFeatureSamplingMixin)
self.add_value(name=SAMPLING_WITHOUT_REPLACEMENT,
mixin=FeatureSamplingWithoutReplacementMixin,
options={OPTION_NUM_SAMPLES})
options={OPTION_SAMPLE_SIZE, OPTION_MIN_SAMPLES, OPTION_MAX_SAMPLES})
self.add_value(name=self.OUTPUT_SAMPLING_ROUND_ROBIN, mixin=RoundRobinOutputSamplingMixin)

def _configure(self, config, value: str, options: Optional[Options]):
if value == NONE:
config.use_no_output_sampling()
elif value == SAMPLING_WITHOUT_REPLACEMENT:
conf = config.use_output_sampling_without_replacement()
conf.set_num_samples(options.get_int(OPTION_NUM_SAMPLES, conf.get_num_samples()))
conf.set_sample_size(options.get_float(OPTION_SAMPLE_SIZE, conf.get_sample_size()))
conf.set_min_samples(options.get_int(OPTION_MIN_SAMPLES, conf.get_min_samples()))
conf.set_max_samples(options.get_int(OPTION_MAX_SAMPLES, conf.get_max_samples()))
elif value == self.OUTPUT_SAMPLING_ROUND_ROBIN:
config.use_round_robin_output_sampling()

Expand Down
14 changes: 11 additions & 3 deletions python/subprojects/common/mlrl/common/cython/output_sampling.pxd
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from mlrl.common.cython._types cimport uint32
from mlrl.common.cython._types cimport float32, uint32


cdef extern from "mlrl/common/sampling/output_sampling_without_replacement.hpp" nogil:
Expand All @@ -7,9 +7,17 @@ cdef extern from "mlrl/common/sampling/output_sampling_without_replacement.hpp"

# Functions:

uint32 getNumSamples() const
float32 getSampleSize() const

IOutputSamplingWithoutReplacementConfig& setNumSamples(uint32 numSamples) except +
# `except +` makes Cython translate C++ exceptions thrown by this setter into Python exceptions
IOutputSamplingWithoutReplacementConfig& setSampleSize(float32 sampleSize) except +

uint32 getMinSamples() const

# The C++ setter takes an unsigned integer; `except +` makes Cython translate C++ exceptions thrown by it into
# Python exceptions
IOutputSamplingWithoutReplacementConfig& setMinSamples(uint32 minSamples) except +

uint32 getMaxSamples() const

# The C++ setter takes an unsigned integer; `except +` makes Cython translate C++ exceptions thrown by it into
# Python exceptions
IOutputSamplingWithoutReplacementConfig& setMaxSamples(uint32 maxSamples) except +


cdef class OutputSamplingWithoutReplacementConfig:
Expand Down
68 changes: 56 additions & 12 deletions python/subprojects/common/mlrl/common/cython/output_sampling.pyx
Original file line number Diff line number Diff line change
@@ -1,30 +1,74 @@
"""
@author: Michael Rapp ([email protected])
"""
from mlrl.common.cython.validation import assert_greater_or_equal
from mlrl.common.cython.validation import assert_greater, assert_greater_or_equal, assert_less


cdef class OutputSamplingWithoutReplacementConfig:
"""
Allows to configure a method for sampling outputs without replacement.
"""

def get_num_samples(self) -> int:
def get_sample_size(self) -> float:
"""
Returns the number of outputs that are included in a sample.
Returns the fraction of outputs that are included in a sample.

:return: The number of outputs that are included in a sample
:return: The fraction of outputs that are included in a sample
"""
return self.config_ptr.getNumSamples()
return self.config_ptr.getSampleSize()

def set_num_samples(self, num_samples: int) -> OutputSamplingWithoutReplacementConfig:
def set_sample_size(self, sample_size: float) -> OutputSamplingWithoutReplacementConfig:
"""
Sets the number of outputs that should be included in a sample.
Sets the fraction of outputs that should be included in a sample.

:param num_samples: The number of outputs that should be included in a sample. Must be at least 1
:return: An `OutputSamplingWithoutReplacementConfig` that allows further configuration of the
sampling method
:param sample_size: The fraction of outputs that should be included in a sample, e.g., a value of 0.6
corresponds to 60 % of the available outputs. Must be in (0, 1)
:return: An `OutputSamplingWithoutReplacementConfig` that allows further configuration of the method
for sampling outputs
"""
assert_greater_or_equal('num_samples', num_samples, 1)
self.config_ptr.setNumSamples(num_samples)
assert_greater('sample_size', sample_size, 0)
assert_less('sample_size', sample_size, 1)
self.config_ptr.setSampleSize(sample_size)
return self

def get_min_samples(self) -> int:
    """
    Provides the configured lower bound on the number of outputs in a sample.

    :return: The minimum number of outputs that are included in a sample
    """
    min_samples = self.config_ptr.getMinSamples()
    return min_samples

def set_min_samples(self, min_samples: int) -> OutputSamplingWithoutReplacementConfig:
    """
    Configures the lower bound on the number of outputs in a sample.

    :param min_samples: The minimum number of outputs that should be included in a sample. Must be at least 1
    :return:            This `OutputSamplingWithoutReplacementConfig`, which allows further configuration of the
                        method for sampling outputs
    """
    # Validate the argument before handing it to the underlying C++ configuration object
    smallest_allowed_value = 1
    assert_greater_or_equal('min_samples', min_samples, smallest_allowed_value)

    self.config_ptr.setMinSamples(min_samples)
    return self

def get_max_samples(self) -> int:
    """
    Provides the configured upper bound on the number of outputs in a sample.

    :return: The maximum number of outputs that are included in a sample
    """
    max_samples = self.config_ptr.getMaxSamples()
    return max_samples

def set_max_samples(self, max_samples: int) -> OutputSamplingWithoutReplacementConfig:
    """
    Configures the upper bound on the number of outputs in a sample.

    :param max_samples: The maximum number of outputs that should be included in a sample. Must be at least
                        `get_min_samples()` or 0, if the number of outputs should not be restricted
    :return:            This `OutputSamplingWithoutReplacementConfig`, which allows further configuration of the
                        method for sampling outputs
    """
    # A value of 0 is passed through unchecked; any other value is compared against the configured minimum
    is_restricted = max_samples != 0

    if is_restricted:
        assert_greater_or_equal('max_samples', max_samples, self.get_min_samples())

    self.config_ptr.setMaxSamples(max_samples)
    return self

0 comments on commit 75d2c91

Please sign in to comment.