diff --git a/CHANGELOG.md b/CHANGELOG.md index a761c5c109..53898ff777 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -30,6 +30,7 @@ This release comes with several API changes. For an updated overview of the avai ### Quality-of-Life Improvements +- The implementation of feature binning has been reworked in a way that helps to avoid redundant code. - The documentation has been updated to a more modern theme supporting light and dark theme variants. - A build option that allows to disable multi-threading support via OpenMP at compile-time has been added. - The groundwork for GPU support was laid. It can be disabled at compile-time via a build option. diff --git a/cpp/subprojects/boosting/include/mlrl/boosting/binning/feature_binning_auto.hpp b/cpp/subprojects/boosting/include/mlrl/boosting/binning/feature_binning_auto.hpp deleted file mode 100644 index 0bc3a36d59..0000000000 --- a/cpp/subprojects/boosting/include/mlrl/boosting/binning/feature_binning_auto.hpp +++ /dev/null @@ -1,35 +0,0 @@ -/* - * @author Michael Rapp (michael.rapp.ml@gmail.com) - */ -#pragma once - -#include "mlrl/common/binning/feature_binning.hpp" -#include "mlrl/common/multi_threading/multi_threading.hpp" - -namespace boosting { - - /** - * Allows to configure a method that automatically decides whether feature binning should be used or not. 
- */ - class AutomaticFeatureBinningConfig final : public IFeatureBinningConfig { - private: - - const std::unique_ptr& multiThreadingConfigPtr_; - - public: - - /** - * @param multiThreadingConfigPtr A reference to an unique pointer that stores the configuration of the - * multi-threading behavior that should be used for the parallel update of - * statistics - */ - AutomaticFeatureBinningConfig(const std::unique_ptr& multiThreadingConfigPtr); - - /** - * @see `IFeatureBinningConfig::createThresholdsFactory` - */ - std::unique_ptr createThresholdsFactory(const IFeatureMatrix& featureMatrix, - const ILabelMatrix& labelMatrix) const override; - }; - -} diff --git a/cpp/subprojects/boosting/include/mlrl/boosting/data/vector_statistic_label_wise_sparse.hpp b/cpp/subprojects/boosting/include/mlrl/boosting/data/vector_statistic_label_wise_sparse.hpp index 06e983d3b7..a8bbe8b601 100644 --- a/cpp/subprojects/boosting/include/mlrl/boosting/data/vector_statistic_label_wise_sparse.hpp +++ b/cpp/subprojects/boosting/include/mlrl/boosting/data/vector_statistic_label_wise_sparse.hpp @@ -3,7 +3,7 @@ */ #pragma once -#include "mlrl/boosting/data/view_histogram_label_wise_sparse.hpp" +#include "mlrl/common/data/triple.hpp" #include "mlrl/common/data/tuple.hpp" #include "mlrl/common/data/view_matrix_sparse_set.hpp" #include "mlrl/common/indices/index_vector_complete.hpp" @@ -266,56 +266,6 @@ namespace boosting { void addToSubset(const SparseSetView>& view, uint32 row, const PartialIndexVector& indices, float64 weight); - /** - * Adds certain gradients and Hessians in a single row of a `SparseLabelWiseHistogramView`, whose positions - * are given as a `CompleteIndexVector`, to this vector. 
- * - * @param view A reference to an object of type `SparseLabelWiseHistogramView` that stores the - * gradients and Hessians to be added to this vector - * @param row The index of the row to be added to this vector - * @param indices A reference to a `CompleteIndexVector' that provides access to the indices - */ - void addToSubset(const SparseLabelWiseHistogramView& view, uint32 row, const CompleteIndexVector& indices); - - /** - * Adds certain gradients and Hessians in a single row of a `SparseLabelWiseHistogramView`, whose positions - * are given as a `PartialIndexVector`, to this vector. - * - * @param view A reference to an object of type `SparseLabelWiseHistogramView` that stores the - * gradients and Hessians to be added to this vector - * @param row The index of the row to be added to this vector - * @param indices A reference to a `PartialIndexVector' that provides access to the indices - */ - void addToSubset(const SparseLabelWiseHistogramView& view, uint32 row, const PartialIndexVector& indices); - - /** - * Adds certain gradients and Hessians in a single row of a `SparseLabelWiseHistogramView`, whose positions - * are given as a `CompleteIndexVector`, to this vector. The gradients and Hessians to be added are - * multiplied by a specific weight. - * - * @param view A reference to an object of type `SparseLabelWiseHistogramView` that stores the - * gradients and Hessians to be added to this vector - * @param row The index of the row to be added to this vector - * @param indices A reference to a `CompleteIndexVector' that provides access to the indices - * @param weight The weight, the gradients and Hessians should be multiplied by - */ - void addToSubset(const SparseLabelWiseHistogramView& view, uint32 row, const CompleteIndexVector& indices, - float64 weight); - - /** - * Adds certain gradients and Hessians in a single row of a `SparseLabelWiseHistogramView`, whose positions - * are given as a `PartialIndexVector`, to this vector. 
The gradients and Hessians to be added are - * multiplied by a specific weight. - * - * @param view A reference to an object of type `SparseLabelWiseHistogramView` that stores the - * gradients and Hessians to be added to this vector - * @param row The index of the row to be added to this vector - * @param indices A reference to a `PartialIndexVector' that provides access to the indices - * @param weight The weight, the gradients and Hessians should be multiplied by - */ - void addToSubset(const SparseLabelWiseHistogramView& view, uint32 row, const PartialIndexVector& indices, - float64 weight); - /** * Sets the gradients and Hessians in this vector to the difference `first - second` between the gradients * and Hessians in two other vectors, considering only the gradients and Hessians in the first vector that diff --git a/cpp/subprojects/boosting/include/mlrl/boosting/data/view_histogram_label_wise_sparse.hpp b/cpp/subprojects/boosting/include/mlrl/boosting/data/view_histogram_label_wise_sparse.hpp deleted file mode 100644 index 9f631706c4..0000000000 --- a/cpp/subprojects/boosting/include/mlrl/boosting/data/view_histogram_label_wise_sparse.hpp +++ /dev/null @@ -1,77 +0,0 @@ -/* - * @author Michael Rapp (michael.rapp.ml@gmail.com) - */ -#pragma once - -#include "mlrl/boosting/util/dll_exports.hpp" -#include "mlrl/common/data/triple.hpp" -#include "mlrl/common/data/view_matrix_c_contiguous.hpp" -#include "mlrl/common/data/view_matrix_composite.hpp" -#include "mlrl/common/data/view_vector.hpp" - -namespace boosting { - - /** - * Implements row-wise read and write access to the gradients and Hessians that have been calculated using a - * label-wise decomposable loss function and are stored in a pre-allocated histogram in the list of lists (LIL) - * format. 
- */ - class MLRLBOOSTING_API SparseLabelWiseHistogramView - : public CompositeMatrix>, AllocatedVector> { - public: - - /** - * @param numRows The number of rows in the view - * @param numCols The number of columns in the view - */ - SparseLabelWiseHistogramView(uint32 numRows, uint32 numCols); - - /** - * @param other A reference to an object of type `SparseLabelWiseHistogramView` that should be copied - */ - SparseLabelWiseHistogramView(SparseLabelWiseHistogramView&& other); - - virtual ~SparseLabelWiseHistogramView() override {} - - /** - * An iterator that provides read-only access to the gradients and Hessians. - */ - typedef typename AllocatedCContiguousView>::value_const_iterator value_const_iterator; - - /** - * An iterator that provides read-only access to the weights that correspond to individual bins. - */ - typedef typename AllocatedVector::const_iterator weight_const_iterator; - - /** - * Returns a `const_iterator` to the beginning of the gradients and Hessians at a specific row. - * - * @param row The index of the row - * @return A `const_iterator` to the beginning of the row - */ - value_const_iterator values_cbegin(uint32 row) const; - - /** - * Returns a `const_iterator` to the end of the gradients and Hessians at a specific row. - * - * @param row The index of the row - * @return A `const_iterator` to the end of the row - */ - value_const_iterator values_cend(uint32 row) const; - - /** - * Returns a `weight_const_iterator` to the beginning of the weights that correspond to individual bins. - * - * @return A `weight_const_iterator` to the beginning - */ - weight_const_iterator weights_cbegin() const; - - /** - * Returns a `weight_const_iterator` to the end of the weights that correspond to individual bins. 
- * - * @return A `weight_const_iterator` to the end - */ - weight_const_iterator weights_cend() const; - }; - -} diff --git a/cpp/subprojects/boosting/include/mlrl/boosting/input/feature_binning_auto.hpp b/cpp/subprojects/boosting/include/mlrl/boosting/input/feature_binning_auto.hpp new file mode 100644 index 0000000000..679ddb88f5 --- /dev/null +++ b/cpp/subprojects/boosting/include/mlrl/boosting/input/feature_binning_auto.hpp @@ -0,0 +1,23 @@ +/* + * @author Michael Rapp (michael.rapp.ml@gmail.com) + */ +#pragma once + +#include "mlrl/common/input/feature_binning.hpp" + +namespace boosting { + + /** + * Allows to configure a method that automatically decides whether feature binning should be used or not. + */ + class AutomaticFeatureBinningConfig final : public IFeatureBinningConfig { + public: + + /** + * @see `IFeatureBinningConfig::createFeatureBinningFactory` + */ + std::unique_ptr createFeatureBinningFactory( + const IFeatureMatrix& featureMatrix, const ILabelMatrix& labelMatrix) const override; + }; + +} diff --git a/cpp/subprojects/boosting/include/mlrl/boosting/learner.hpp b/cpp/subprojects/boosting/include/mlrl/boosting/learner.hpp index 05fab90082..eb369e70ad 100644 --- a/cpp/subprojects/boosting/include/mlrl/boosting/learner.hpp +++ b/cpp/subprojects/boosting/include/mlrl/boosting/learner.hpp @@ -8,10 +8,10 @@ #pragma warning(disable : 4250) #endif -#include "mlrl/boosting/binning/feature_binning_auto.hpp" #include "mlrl/boosting/binning/label_binning_auto.hpp" #include "mlrl/boosting/binning/label_binning_equal_width.hpp" #include "mlrl/boosting/binning/label_binning_no.hpp" +#include "mlrl/boosting/input/feature_binning_auto.hpp" #include "mlrl/boosting/losses/loss_example_wise_logistic.hpp" #include "mlrl/boosting/losses/loss_example_wise_squared_error.hpp" #include "mlrl/boosting/losses/loss_example_wise_squared_hinge.hpp" @@ -157,8 +157,7 @@ namespace boosting { virtual void useAutomaticFeatureBinning() { std::unique_ptr& featureBinningConfigPtr 
= this->getFeatureBinningConfigPtr(); - featureBinningConfigPtr = - std::make_unique(this->getParallelStatisticUpdateConfigPtr()); + featureBinningConfigPtr = std::make_unique(); } }; diff --git a/cpp/subprojects/boosting/meson.build b/cpp/subprojects/boosting/meson.build index e97c1bc0b3..7dc83cd983 100644 --- a/cpp/subprojects/boosting/meson.build +++ b/cpp/subprojects/boosting/meson.build @@ -2,7 +2,6 @@ project('boosting', 'cpp') # Source files source_files = [ - 'src/mlrl/boosting/binning/feature_binning_auto.cpp', 'src/mlrl/boosting/binning/label_binning_auto.cpp', 'src/mlrl/boosting/binning/label_binning_equal_width.cpp', 'src/mlrl/boosting/binning/label_binning_no.cpp', @@ -11,8 +10,8 @@ source_files = [ 'src/mlrl/boosting/data/vector_statistic_example_wise_dense.cpp', 'src/mlrl/boosting/data/vector_statistic_label_wise_dense.cpp', 'src/mlrl/boosting/data/vector_statistic_label_wise_sparse.cpp', - 'src/mlrl/boosting/data/view_histogram_label_wise_sparse.cpp', 'src/mlrl/boosting/data/view_statistic_example_wise_dense.cpp', + 'src/mlrl/boosting/input/feature_binning_auto.cpp', 'src/mlrl/boosting/losses/loss_example_wise_logistic.cpp', 'src/mlrl/boosting/losses/loss_example_wise_squared_error.cpp', 'src/mlrl/boosting/losses/loss_example_wise_squared_hinge.cpp', diff --git a/cpp/subprojects/boosting/src/mlrl/boosting/binning/feature_binning_auto.cpp b/cpp/subprojects/boosting/src/mlrl/boosting/binning/feature_binning_auto.cpp deleted file mode 100644 index a32b5a9999..0000000000 --- a/cpp/subprojects/boosting/src/mlrl/boosting/binning/feature_binning_auto.cpp +++ /dev/null @@ -1,22 +0,0 @@ -#include "mlrl/boosting/binning/feature_binning_auto.hpp" - -#include "mlrl/common/binning/feature_binning_equal_width.hpp" -#include "mlrl/common/binning/feature_binning_no.hpp" - -namespace boosting { - - AutomaticFeatureBinningConfig::AutomaticFeatureBinningConfig( - const std::unique_ptr& multiThreadingConfigPtr) - : multiThreadingConfigPtr_(multiThreadingConfigPtr) {} - 
- std::unique_ptr AutomaticFeatureBinningConfig::createThresholdsFactory( - const IFeatureMatrix& featureMatrix, const ILabelMatrix& labelMatrix) const { - if (!featureMatrix.isSparse() && featureMatrix.getNumExamples() > 200000) { - return EqualWidthFeatureBinningConfig(multiThreadingConfigPtr_) - .createThresholdsFactory(featureMatrix, labelMatrix); - } else { - return NoFeatureBinningConfig(multiThreadingConfigPtr_).createThresholdsFactory(featureMatrix, labelMatrix); - } - } - -} diff --git a/cpp/subprojects/boosting/src/mlrl/boosting/data/vector_statistic_label_wise_sparse.cpp b/cpp/subprojects/boosting/src/mlrl/boosting/data/vector_statistic_label_wise_sparse.cpp index cee5f6c3e9..22b83e0297 100644 --- a/cpp/subprojects/boosting/src/mlrl/boosting/data/vector_statistic_label_wise_sparse.cpp +++ b/cpp/subprojects/boosting/src/mlrl/boosting/data/vector_statistic_label_wise_sparse.cpp @@ -164,50 +164,6 @@ namespace boosting { } } - void SparseLabelWiseStatisticVector::addToSubset(const SparseLabelWiseHistogramView& view, uint32 row, - const CompleteIndexVector& indices) { - SparseLabelWiseHistogramView::weight_const_iterator weightIterator = view.weights_cbegin(); - float64 binWeight = weightIterator[row]; - - if (binWeight != 0) { - sumOfWeights_ += binWeight; - addToView(this->view.begin(), view.values_cbegin(row), this->getNumElements()); - } - } - - void SparseLabelWiseStatisticVector::addToSubset(const SparseLabelWiseHistogramView& view, uint32 row, - const PartialIndexVector& indices) { - SparseLabelWiseHistogramView::weight_const_iterator weightIterator = view.weights_cbegin(); - float64 binWeight = weightIterator[row]; - - if (binWeight != 0) { - sumOfWeights_ += binWeight; - addToView(this->view.begin(), view.values_cbegin(row), indices.cbegin(), indices.getNumElements()); - } - } - - void SparseLabelWiseStatisticVector::addToSubset(const SparseLabelWiseHistogramView& view, uint32 row, - const CompleteIndexVector& indices, float64 weight) { - 
SparseLabelWiseHistogramView::weight_const_iterator weightIterator = view.weights_cbegin(); - float64 binWeight = weightIterator[row] * weight; - - if (binWeight != 0) { - sumOfWeights_ += binWeight; - addToView(this->view.begin(), view.values_cbegin(row), this->getNumElements(), weight); - } - } - - void SparseLabelWiseStatisticVector::addToSubset(const SparseLabelWiseHistogramView& view, uint32 row, - const PartialIndexVector& indices, float64 weight) { - SparseLabelWiseHistogramView::weight_const_iterator weightIterator = view.weights_cbegin(); - float64 binWeight = weightIterator[row] * weight; - - if (binWeight != 0) { - sumOfWeights_ += binWeight; - addToView(this->view.begin(), view.values_cbegin(row), indices.cbegin(), indices.getNumElements(), weight); - } - } - void SparseLabelWiseStatisticVector::difference(const SparseLabelWiseStatisticVector& first, const CompleteIndexVector& firstIndices, const SparseLabelWiseStatisticVector& second) { diff --git a/cpp/subprojects/boosting/src/mlrl/boosting/data/view_histogram_label_wise_sparse.cpp b/cpp/subprojects/boosting/src/mlrl/boosting/data/view_histogram_label_wise_sparse.cpp deleted file mode 100644 index 8f483cad1a..0000000000 --- a/cpp/subprojects/boosting/src/mlrl/boosting/data/view_histogram_label_wise_sparse.cpp +++ /dev/null @@ -1,28 +0,0 @@ -#include "mlrl/boosting/data/view_histogram_label_wise_sparse.hpp" - -namespace boosting { - - SparseLabelWiseHistogramView::SparseLabelWiseHistogramView(uint32 numRows, uint32 numCols) - : CompositeMatrix>, AllocatedVector>( - AllocatedCContiguousView>(numRows, numCols), AllocatedVector(numRows, true), numRows, - numCols) {} - - SparseLabelWiseHistogramView::SparseLabelWiseHistogramView(SparseLabelWiseHistogramView&& other) - : CompositeMatrix>, AllocatedVector>(std::move(other)) {} - - SparseLabelWiseHistogramView::value_const_iterator SparseLabelWiseHistogramView::values_cbegin(uint32 row) const { - return CompositeView::firstView.values_cbegin(row); - } - - 
SparseLabelWiseHistogramView::value_const_iterator SparseLabelWiseHistogramView::values_cend(uint32 row) const { - return CompositeView::firstView.values_cend(row); - } - - SparseLabelWiseHistogramView::weight_const_iterator SparseLabelWiseHistogramView::weights_cbegin() const { - return CompositeView::secondView.cbegin(); - } - - SparseLabelWiseHistogramView::weight_const_iterator SparseLabelWiseHistogramView::weights_cend() const { - return CompositeView::secondView.cend(); - } -} diff --git a/cpp/subprojects/boosting/src/mlrl/boosting/input/feature_binning_auto.cpp b/cpp/subprojects/boosting/src/mlrl/boosting/input/feature_binning_auto.cpp new file mode 100644 index 0000000000..f3e1b4e0e3 --- /dev/null +++ b/cpp/subprojects/boosting/src/mlrl/boosting/input/feature_binning_auto.cpp @@ -0,0 +1,17 @@ +#include "mlrl/boosting/input/feature_binning_auto.hpp" + +#include "mlrl/common/input/feature_binning_equal_width.hpp" +#include "mlrl/common/input/feature_binning_no.hpp" + +namespace boosting { + + std::unique_ptr AutomaticFeatureBinningConfig::createFeatureBinningFactory( + const IFeatureMatrix& featureMatrix, const ILabelMatrix& labelMatrix) const { + if (!featureMatrix.isSparse() && featureMatrix.getNumExamples() > 200000) { + return EqualWidthFeatureBinningConfig().createFeatureBinningFactory(featureMatrix, labelMatrix); + } else { + return NoFeatureBinningConfig().createFeatureBinningFactory(featureMatrix, labelMatrix); + } + } + +} diff --git a/cpp/subprojects/boosting/src/mlrl/boosting/statistics/statistics_example_wise_common.hpp b/cpp/subprojects/boosting/src/mlrl/boosting/statistics/statistics_example_wise_common.hpp index cb00f7ed39..5a050485d7 100644 --- a/cpp/subprojects/boosting/src/mlrl/boosting/statistics/statistics_example_wise_common.hpp +++ b/cpp/subprojects/boosting/src/mlrl/boosting/statistics/statistics_example_wise_common.hpp @@ -4,7 +4,6 @@ #pragma once #include "mlrl/boosting/statistics/statistics_example_wise.hpp" -#include 
"mlrl/common/binning/bin_weight_vector.hpp" namespace boosting { @@ -290,181 +289,6 @@ namespace boosting { } }; - /** - * Provides access to gradients and Hessians that are calculated according to a differentiable loss function that is - * applied example-wise and are organized as a histogram. - * - * @tparam StatisticVector The type of the vectors that are used to store gradients and Hessians - * @tparam StatisticView The type of the view that provides access to the original gradients and Hessians - * @tparam Histogram The type of a histogram that stores aggregated gradients and Hessians - * @tparam RuleEvaluationFactory The type of the factory that allows to create instances of the class that is - * used for calculating the predictions of rules, as well as corresponding quality - * scores - * @tparam BinIndexVector The type of the vector that stores the indices of the bins, individual examples - * have been assigned to - * @tparam WeightVector The type of the vector that provides access to the weights of individual - * statistics - */ - template - class ExampleWiseHistogram final - : virtual public IHistogram, - public AbstractExampleWiseImmutableWeightedStatistics { - private: - - /** - * Provides access to a subset of the gradients and Hessians that are stored by an instance of the class - * `ExampleWiseHistogram`. 
- * - * @tparam IndexVector The type of the vector that provides access to the indices of the labels that are - * included in the subset - */ - template - class WeightedStatisticsSubset final - : public AbstractExampleWiseImmutableWeightedStatistics< - StatisticVector, typename Histogram::view_type, RuleEvaluationFactory, - BinWeightVector>::template AbstractWeightedStatisticsSubset { - private: - - const ExampleWiseHistogram& histogram_; - - std::unique_ptr totalCoverableSumVectorPtr_; - - public: - - /** - * @param histogram A reference to an object of type `ExampleWiseHistogram` that stores the - * gradients and Hessians - * @param totalSumVector A reference to an object of template type `StatisticVector` that stores - * the total sums of gradients and Hessians - * @param labelIndices A reference to an object of template type `IndexVector` that provides - * access to the indices of the labels that are included in the subset - */ - WeightedStatisticsSubset(const ExampleWiseHistogram& histogram, - const StatisticVector& totalSumVector, const IndexVector& labelIndices) - : AbstractExampleWiseImmutableWeightedStatistics< - StatisticVector, typename Histogram::view_type, RuleEvaluationFactory, - BinWeightVector>::template AbstractWeightedStatisticsSubset(histogram, - totalSumVector, - labelIndices), - histogram_(histogram) {} - - /** - * @see `IWeightedStatisticsSubset::addToMissing` - */ - void addToMissing(uint32 statisticIndex) override { - // Create a vector for storing the totals sums of gradients and Hessians, if necessary... - if (!totalCoverableSumVectorPtr_) { - totalCoverableSumVectorPtr_ = std::make_unique(*this->totalSumVector_); - this->totalSumVector_ = totalCoverableSumVectorPtr_.get(); - } - - // Subtract the gradients and Hessians of the example at the given index (weighted by the given - // weight) from the total sums of gradients and Hessians... 
- removeExampleWiseStatistic(histogram_.originalWeights_, histogram_.originalStatisticView_, - *totalCoverableSumVectorPtr_, statisticIndex); - } - }; - - const std::unique_ptr histogramPtr_; - - const std::unique_ptr binWeightVectorPtr_; - - const BinIndexVector& binIndexVector_; - - const StatisticView& originalStatisticView_; - - const WeightVector& originalWeights_; - - const StatisticVector& totalSumVector_; - - public: - - /** - * @param histogramPtr An unique pointer to an object of template type `Histogram` that stores the - * gradients and Hessians in the histogram - * @param binWeightVectorPtr An unique pointer to an object of type `BinWeightVector` that stores the - * weights of individual bins - * @param binIndexVector A reference to an object of template type `BinIndexVector` that stores the - * indices of the bins, individual examples have been assigned to - * @param originalStatisticView A reference to an object of template type `StatisticView` that provides - * access to the original gradients and Hessians, the histogram was created - * from - * @param originalWeights A reference to an object of template type `WeightVector` that provides - * access to the weights of the original statistics, the histogram was created - * from - * @param totalSumVector A reference to an object of template type `StatisticVector` that stores the - * total sums of gradients and Hessians - * @param ruleEvaluationFactory A reference to an object of type `RuleEvaluationFactory` that allows to - * create instances of the class that should be used for calculating the - * predictions of rules, as well as their overall quality - */ - ExampleWiseHistogram(std::unique_ptr histogramPtr, - std::unique_ptr binWeightVectorPtr, - const BinIndexVector& binIndexVector, const StatisticView& originalStatisticView, - const WeightVector& originalWeights, const StatisticVector& totalSumVector, - const RuleEvaluationFactory& ruleEvaluationFactory) - : 
AbstractExampleWiseImmutableWeightedStatistics( - histogramPtr->getView(), ruleEvaluationFactory, *binWeightVectorPtr), - histogramPtr_(std::move(histogramPtr)), binWeightVectorPtr_(std::move(binWeightVectorPtr)), - binIndexVector_(binIndexVector), originalStatisticView_(originalStatisticView), - originalWeights_(originalWeights), totalSumVector_(totalSumVector) {} - - /** - * @see `IHistogram::clear` - */ - void clear() override { - histogramPtr_->clear(); - binWeightVectorPtr_->clear(); - } - - /** - * @see `IHistogram::getBinWeight` - */ - uint32 getBinWeight(uint32 binIndex) const override { - return (*binWeightVectorPtr_)[binIndex]; - } - - /** - * @see `IHistogram::addToBin` - */ - void addToBin(uint32 statisticIndex) override { - float64 weight = originalWeights_[statisticIndex]; - - if (weight > 0) { - uint32 binIndex = binIndexVector_.getBinIndex(statisticIndex); - - if (binIndex != IBinIndexVector::BIN_INDEX_SPARSE) { - binWeightVectorPtr_->increaseWeight(binIndex); - histogramPtr_->addToRow(binIndex, originalStatisticView_.gradients_cbegin(statisticIndex), - originalStatisticView_.gradients_cend(statisticIndex), - originalStatisticView_.hessians_cbegin(statisticIndex), - originalStatisticView_.hessians_cend(statisticIndex), weight); - } - } - } - - /** - * @see `IImmutableWeightedStatistics::createSubset` - */ - std::unique_ptr createSubset( - const CompleteIndexVector& labelIndices) const override { - return std::make_unique>(*this, totalSumVector_, - labelIndices); - } - - /** - * @see `IImmutableWeightedStatistics::createSubset` - */ - std::unique_ptr createSubset( - const PartialIndexVector& labelIndices) const override { - return std::make_unique>(*this, totalSumVector_, - labelIndices); - } - }; - template static inline void addExampleWiseStatistic(const WeightVector& weights, const StatisticView& statisticView, StatisticVector& statisticVector, uint32 statisticIndex) { @@ -499,20 +323,6 @@ namespace boosting { 
statisticView.hessians_cbegin(statisticIndex), statisticView.hessians_cend(statisticIndex)); } - template - static inline std::unique_ptr createExampleWiseHistogramInternally( - const BinIndexVector& binIndexVector, const StatisticView& originalStatisticView, - const WeightVector& originalWeights, const StatisticVector& totalSumVector, - const RuleEvaluationFactory& ruleEvaluationFactory, uint32 numBins) { - std::unique_ptr histogramPtr = std::make_unique(numBins, originalStatisticView.numCols); - std::unique_ptr binWeightVectorPtr = std::make_unique(numBins); - return std::make_unique>( - std::move(histogramPtr), std::move(binWeightVectorPtr), binIndexVector, originalStatisticView, - originalWeights, totalSumVector, ruleEvaluationFactory); - } - /** * Provides access to weighted gradients and Hessians that are calculated according to a differentiable loss * function that is applied example-wise and allows to update the gradients and Hessians after a new rule has been @@ -520,15 +330,13 @@ namespace boosting { * * @tparam StatisticVector The type of the vectors that are used to store gradients and Hessians * @tparam StatisticView The type of the view that provides access to the gradients and Hessians - * @tparam Histogram The type of a histogram that stores aggregated gradients and Hessians * @tparam RuleEvaluationFactory The type of the factory that allows to create instances of the class that is * used for calculating the predictions of rules, as well as corresponding quality * scores * @tparam WeightVector The type of the vector that provides access to the weights of individual * statistics */ - template + template class ExampleWiseWeightedStatistics final : virtual public IWeightedStatistics, public AbstractExampleWiseImmutableWeightedStatistics copy() const override { - return std::make_unique>(*this); + return std::make_unique< + ExampleWiseWeightedStatistics>( + *this); } /** @@ -651,28 +460,6 @@ namespace boosting { 
removeExampleWiseStatistic(this->weights_, this->statisticView_, *totalSumVectorPtr_, statisticIndex); } - /** - * @see `IWeightedStatistics::createHistogram` - */ - std::unique_ptr createHistogram(const DenseBinIndexVector& binIndexVector, - uint32 numBins) const override { - return createExampleWiseHistogramInternally( - binIndexVector, this->statisticView_, this->weights_, *totalSumVectorPtr_, - this->ruleEvaluationFactory_, numBins); - } - - /** - * @see `IWeightedStatistics::createHistogram` - */ - std::unique_ptr createHistogram(const DokBinIndexVector& binIndexVector, - uint32 numBins) const override { - return createExampleWiseHistogramInternally( - binIndexVector, this->statisticView_, this->weights_, *totalSumVectorPtr_, - this->ruleEvaluationFactory_, numBins); - } - /** * @see `IImmutableWeightedStatistics::createSubset` */ @@ -707,7 +494,6 @@ namespace boosting { * training examples * @tparam StatisticVector The type of the vectors that are used to store gradients and Hessians * @tparam StatisticMatrix The type of the matrix that stores the gradients and Hessians - * @tparam Histogram The type of a histogram that stores aggregated gradients and Hessians * @tparam ScoreMatrix The type of the matrices that are used to store predicted scores * @tparam LossFunction The type of the loss function that is used to calculate gradients and * Hessians @@ -720,9 +506,9 @@ namespace boosting { * that is used for calculating the label-wise predictions of rules, as * well as their overall quality */ - template + template class AbstractExampleWiseStatistics : virtual public IExampleWiseStatistics { private: @@ -996,7 +782,7 @@ namespace boosting { std::unique_ptr createWeightedStatistics( const EqualWeightVector& weights) const override final { return std::make_unique< - ExampleWiseWeightedStatistics>( statisticMatrixPtr_->getView(), *ruleEvaluationFactory_, weights); } @@ -1007,7 +793,7 @@ namespace boosting { std::unique_ptr createWeightedStatistics( const 
BitWeightVector& weights) const override final { return std::make_unique< - ExampleWiseWeightedStatistics>( statisticMatrixPtr_->getView(), *ruleEvaluationFactory_, weights); } @@ -1018,7 +804,7 @@ namespace boosting { std::unique_ptr createWeightedStatistics( const DenseWeightVector& weights) const override final { return std::make_unique< - ExampleWiseWeightedStatistics>>( statisticMatrixPtr_->getView(), *ruleEvaluationFactory_, weights); } diff --git a/cpp/subprojects/boosting/src/mlrl/boosting/statistics/statistics_label_wise_common.hpp b/cpp/subprojects/boosting/src/mlrl/boosting/statistics/statistics_label_wise_common.hpp index 1442066a0e..934f2bf8c6 100644 --- a/cpp/subprojects/boosting/src/mlrl/boosting/statistics/statistics_label_wise_common.hpp +++ b/cpp/subprojects/boosting/src/mlrl/boosting/statistics/statistics_label_wise_common.hpp @@ -4,7 +4,6 @@ #pragma once #include "mlrl/boosting/statistics/statistics_label_wise.hpp" -#include "mlrl/common/binning/bin_weight_vector.hpp" namespace boosting { @@ -276,179 +275,6 @@ namespace boosting { } }; - /** - * Provides access to gradients and Hessians that are calculated according to a differentiable loss function that is - * applied label-wise and are organized as a histogram. 
- * - * @tparam StatisticVector The type of the vectors that are used to store gradients and Hessians - * @tparam StatisticView The type of the view that provides access to the original gradients and Hessians - * @tparam Histogram The type of a histogram that stores aggregated gradients and Hessians - * @tparam RuleEvaluationFactory The type of the factory that allows to create instances of the class that is - * used for calculating the predictions of rules, as well as corresponding quality - * scores - * @tparam BinIndexVector The type of the vector that stores the indices of the bins, individual examples - * have been assigned to - * @tparam WeightVector The type of the vector that provides access to the weights of individual - * statistics - */ - template - class LabelWiseHistogram final - : virtual public IHistogram, - public AbstractLabelWiseImmutableWeightedStatistics { - private: - - /** - * Provides access to a subset of the gradients and Hessians that are stored by an instance of the class - * `LabelWiseHistogram`. 
- * - * @tparam IndexVector The type of the vector that provides access to the indices of the labels that are - * included in the subset - */ - template - class WeightedStatisticsSubset final - : public AbstractLabelWiseImmutableWeightedStatistics< - StatisticVector, typename Histogram::view_type, RuleEvaluationFactory, - BinWeightVector>::template AbstractWeightedStatisticsSubset { - private: - - const LabelWiseHistogram& histogram_; - - std::unique_ptr totalCoverableSumVectorPtr_; - - public: - - /** - * @param histogram A reference to an object of type `LabelWiseHistogram` that stores the - * gradients and Hessians - * @param totalSumVector A reference to an object of template type `StatisticVector` that stores - * the total sums of gradients and Hessians - * @param labelIndices A reference to an object of template type `IndexVector` that provides - * access to the indices of the labels that are included in the subset - */ - WeightedStatisticsSubset(const LabelWiseHistogram& histogram, const StatisticVector& totalSumVector, - const IndexVector& labelIndices) - : AbstractLabelWiseImmutableWeightedStatistics< - StatisticVector, typename Histogram::view_type, RuleEvaluationFactory, - BinWeightVector>::template AbstractWeightedStatisticsSubset(histogram, - totalSumVector, - labelIndices), - histogram_(histogram) {} - - /** - * @see `IWeightedStatisticsSubset::addToMissing` - */ - void addToMissing(uint32 statisticIndex) override { - // Create a vector for storing the totals sums of gradients and Hessians, if necessary... - if (!totalCoverableSumVectorPtr_) { - totalCoverableSumVectorPtr_ = std::make_unique(*this->totalSumVector_); - this->totalSumVector_ = totalCoverableSumVectorPtr_.get(); - } - - // Subtract the gradients and Hessians of the example at the given index (weighted by the given - // weight) from the total sums of gradients and Hessians... 
- removeLabelWiseStatistic(histogram_.originalWeights_, histogram_.originalStatisticView_, - *totalCoverableSumVectorPtr_, statisticIndex); - } - }; - - const std::unique_ptr histogramPtr_; - - const std::unique_ptr binWeightVectorPtr_; - - const BinIndexVector& binIndexVector_; - - const StatisticView& originalStatisticView_; - - const WeightVector& originalWeights_; - - const StatisticVector& totalSumVector_; - - public: - - /** - * @param histogramPtr An unique pointer to an object of template type `Histogram` that stores the - * gradients and Hessians in the histogram - * @param binWeightVectorPtr An unique pointer to an object of type `BinWeightVector` that stores the - * weights of individual bins - * @param binIndexVector A reference to an object of template type `BinIndexVector` that stores the - * indices of the bins, individual examples have been assigned to - * @param originalStatisticView A reference to an object of template type `StatisticView` that provides - * access to the original gradients and Hessians, the histogram was created - * from - * @param originalWeights A reference to an object of template type `WeightVector` that provides - * access to the weights of the original statistics, the histogram was created - * from - * @param totalSumVector A reference to an object of template type `StatisticVector` that stores the - * total sums of gradients and Hessians - * @param ruleEvaluationFactory A reference to an object of type `RuleEvaluationFactory` that allows to - * create instances of the class that should be used for calculating the - * predictions of rules, as well as their overall quality - */ - LabelWiseHistogram(std::unique_ptr histogramPtr, - std::unique_ptr binWeightVectorPtr, - const BinIndexVector& binIndexVector, const StatisticView& originalStatisticView, - const WeightVector& originalWeights, const StatisticVector& totalSumVector, - const RuleEvaluationFactory& ruleEvaluationFactory) - : AbstractLabelWiseImmutableWeightedStatistics( 
- histogramPtr->getView(), ruleEvaluationFactory, *binWeightVectorPtr), - histogramPtr_(std::move(histogramPtr)), binWeightVectorPtr_(std::move(binWeightVectorPtr)), - binIndexVector_(binIndexVector), originalStatisticView_(originalStatisticView), - originalWeights_(originalWeights), totalSumVector_(totalSumVector) {} - - /** - * @see `IHistogram::clear` - */ - void clear() override { - histogramPtr_->clear(); - binWeightVectorPtr_->clear(); - } - - /** - * @see `IHistogram::getBinWeight` - */ - uint32 getBinWeight(uint32 binIndex) const override { - return (*binWeightVectorPtr_)[binIndex]; - } - - /** - * @see `IHistogram::addToBin` - */ - void addToBin(uint32 statisticIndex) override { - float64 weight = originalWeights_[statisticIndex]; - - if (weight > 0) { - uint32 binIndex = binIndexVector_.getBinIndex(statisticIndex); - - if (binIndex != IBinIndexVector::BIN_INDEX_SPARSE) { - binWeightVectorPtr_->increaseWeight(binIndex); - histogramPtr_->addToRow(binIndex, originalStatisticView_.values_cbegin(statisticIndex), - originalStatisticView_.values_cend(statisticIndex), weight); - } - } - } - - /** - * @see `IImmutableWeightedStatistics::createSubset` - */ - std::unique_ptr createSubset( - const CompleteIndexVector& labelIndices) const override { - return std::make_unique>(*this, totalSumVector_, - labelIndices); - } - - /** - * @see `IImmutableWeightedStatistics::createSubset` - */ - std::unique_ptr createSubset( - const PartialIndexVector& labelIndices) const override { - return std::make_unique>(*this, totalSumVector_, - labelIndices); - } - }; - template static inline void addLabelWiseStatistic(const EqualWeightVector& weights, const StatisticView& statisticView, StatisticVector& statisticVector, uint32 statisticIndex) { @@ -475,20 +301,6 @@ namespace boosting { statisticVector.remove(statisticView, statisticIndex, weight); } - template - static inline std::unique_ptr createLabelWiseHistogramInternally( - const BinIndexVector& binIndexVector, const 
StatisticView& originalStatisticView, - const WeightVector& originalWeights, const StatisticVector& totalSumVector, - const RuleEvaluationFactory& ruleEvaluationFactory, uint32 numBins) { - std::unique_ptr histogramPtr = std::make_unique(numBins, originalStatisticView.numCols); - std::unique_ptr binWeightVectorPtr = std::make_unique(numBins); - return std::make_unique>( - std::move(histogramPtr), std::move(binWeightVectorPtr), binIndexVector, originalStatisticView, - originalWeights, totalSumVector, ruleEvaluationFactory); - } - /** * Provides access to weighted gradients and Hessians that are calculated according to a differentiable loss * function that is applied label-wise and allows to update the gradients and Hessians after a new rule has been @@ -496,15 +308,13 @@ namespace boosting { * * @tparam StatisticVector The type of the vectors that are used to store gradients and Hessians * @tparam StatisticView The type of the view that provides access to the gradients and Hessians - * @tparam Histogram The type of a histogram that stores aggregated gradients and Hessians * @tparam RuleEvaluationFactory The type of the factory that allows to create instances of the class that is * used for calculating the predictions of rules, as well as corresponding quality * scores * @tparam WeightVector The type of the vector that provides access to the weights of individual * statistics */ - template + template class LabelWiseWeightedStatistics final : virtual public IWeightedStatistics, public AbstractLabelWiseImmutableWeightedStatistics copy() const override { - return std::make_unique>(*this); + return std::make_unique< + LabelWiseWeightedStatistics>( + *this); } /** @@ -626,28 +437,6 @@ namespace boosting { removeLabelWiseStatistic(this->weights_, this->statisticView_, *totalSumVectorPtr_, statisticIndex); } - /** - * @see `IWeightedStatistics::createHistogram` - */ - std::unique_ptr createHistogram(const DenseBinIndexVector& binIndexVector, - uint32 numBins) const override 
{ - return createLabelWiseHistogramInternally( - binIndexVector, this->statisticView_, this->weights_, *totalSumVectorPtr_, - this->ruleEvaluationFactory_, numBins); - } - - /** - * @see `IWeightedStatistics::createHistogram` - */ - std::unique_ptr createHistogram(const DokBinIndexVector& binIndexVector, - uint32 numBins) const override { - return createLabelWiseHistogramInternally( - binIndexVector, this->statisticView_, this->weights_, *totalSumVectorPtr_, - this->ruleEvaluationFactory_, numBins); - } - /** * @see `IImmutableWeightedStatistics::createSubset` */ @@ -698,7 +487,6 @@ namespace boosting { * examples * @tparam StatisticVector The type of the vectors that are used to store gradients and Hessians * @tparam StatisticMatrix The type of the matrix that provides access to the gradients and Hessians - * @tparam Histogram The type of a histogram that stores aggregated gradients and Hessians * @tparam ScoreMatrix The type of the matrices that are used to store predicted scores * @tparam LossFunction The type of the loss function that is used to calculate gradients and Hessians * @tparam EvaluationMeasure The type of the evaluation measure that is used to assess the quality of @@ -707,8 +495,8 @@ namespace boosting { * used for calculating the predictions of rules, as well as corresponding quality * scores */ - template + template class AbstractLabelWiseStatistics : virtual public ILabelWiseStatistics { private: @@ -969,9 +757,8 @@ namespace boosting { */ std::unique_ptr createWeightedStatistics( const EqualWeightVector& weights) const override final { - return std::make_unique< - LabelWiseWeightedStatistics>( + return std::make_unique>( statisticMatrixPtr_->getView(), *ruleEvaluationFactory_, weights); } @@ -980,9 +767,8 @@ namespace boosting { */ std::unique_ptr createWeightedStatistics( const BitWeightVector& weights) const override final { - return std::make_unique< - LabelWiseWeightedStatistics>( + return std::make_unique>( statisticMatrixPtr_->getView(), 
*ruleEvaluationFactory_, weights); } @@ -992,7 +778,7 @@ namespace boosting { std::unique_ptr createWeightedStatistics( const DenseWeightVector& weights) const override final { return std::make_unique< - LabelWiseWeightedStatistics>>( statisticMatrixPtr_->getView(), *ruleEvaluationFactory_, weights); } diff --git a/cpp/subprojects/boosting/src/mlrl/boosting/statistics/statistics_label_wise_dense.hpp b/cpp/subprojects/boosting/src/mlrl/boosting/statistics/statistics_label_wise_dense.hpp index 24c8ca3202..9cff061d36 100644 --- a/cpp/subprojects/boosting/src/mlrl/boosting/statistics/statistics_label_wise_dense.hpp +++ b/cpp/subprojects/boosting/src/mlrl/boosting/statistics/statistics_label_wise_dense.hpp @@ -51,8 +51,8 @@ namespace boosting { template class DenseLabelWiseStatistics final : public AbstractLabelWiseStatistics, - ILabelWiseLoss, IEvaluationMeasure, ILabelWiseRuleEvaluationFactory> { + NumericCContiguousMatrix, ILabelWiseLoss, IEvaluationMeasure, + ILabelWiseRuleEvaluationFactory> { public: /** @@ -78,8 +78,8 @@ namespace boosting { std::unique_ptr statisticMatrixPtr, std::unique_ptr> scoreMatrixPtr) : AbstractLabelWiseStatistics, - ILabelWiseLoss, IEvaluationMeasure, ILabelWiseRuleEvaluationFactory>( + NumericCContiguousMatrix, ILabelWiseLoss, IEvaluationMeasure, + ILabelWiseRuleEvaluationFactory>( std::move(lossPtr), std::move(evaluationMeasurePtr), ruleEvaluationFactory, labelMatrix, std::move(statisticMatrixPtr), std::move(scoreMatrixPtr)) {} diff --git a/cpp/subprojects/boosting/src/mlrl/boosting/statistics/statistics_provider_example_wise_dense.cpp b/cpp/subprojects/boosting/src/mlrl/boosting/statistics/statistics_provider_example_wise_dense.cpp index e15b67a040..9be6024469 100644 --- a/cpp/subprojects/boosting/src/mlrl/boosting/statistics/statistics_provider_example_wise_dense.cpp +++ b/cpp/subprojects/boosting/src/mlrl/boosting/statistics/statistics_provider_example_wise_dense.cpp @@ -60,9 +60,9 @@ namespace boosting { template class 
DenseExampleWiseStatistics final : public AbstractExampleWiseStatistics, IExampleWiseLoss, IEvaluationMeasure, - IExampleWiseRuleEvaluationFactory, ILabelWiseRuleEvaluationFactory> { + DenseExampleWiseStatisticMatrix, NumericCContiguousMatrix, + IExampleWiseLoss, IEvaluationMeasure, IExampleWiseRuleEvaluationFactory, + ILabelWiseRuleEvaluationFactory> { public: /** @@ -88,9 +88,9 @@ namespace boosting { std::unique_ptr statisticMatrixPtr, std::unique_ptr> scoreMatrixPtr) : AbstractExampleWiseStatistics, IExampleWiseLoss, IEvaluationMeasure, - IExampleWiseRuleEvaluationFactory, ILabelWiseRuleEvaluationFactory>( + DenseExampleWiseStatisticMatrix, NumericCContiguousMatrix, + IExampleWiseLoss, IEvaluationMeasure, IExampleWiseRuleEvaluationFactory, + ILabelWiseRuleEvaluationFactory>( std::move(lossPtr), std::move(evaluationMeasurePtr), ruleEvaluationFactory, labelMatrix, std::move(statisticMatrixPtr), std::move(scoreMatrixPtr)) {} diff --git a/cpp/subprojects/boosting/src/mlrl/boosting/statistics/statistics_provider_label_wise_sparse.cpp b/cpp/subprojects/boosting/src/mlrl/boosting/statistics/statistics_provider_label_wise_sparse.cpp index 13be528a5b..89c4273455 100644 --- a/cpp/subprojects/boosting/src/mlrl/boosting/statistics/statistics_provider_label_wise_sparse.cpp +++ b/cpp/subprojects/boosting/src/mlrl/boosting/statistics/statistics_provider_label_wise_sparse.cpp @@ -7,7 +7,6 @@ #include "mlrl/boosting/data/matrix_sparse_set_numeric.hpp" #include "mlrl/boosting/data/vector_statistic_label_wise_sparse.hpp" -#include "mlrl/boosting/data/view_histogram_label_wise_sparse.hpp" #include "mlrl/common/util/openmp.hpp" #include "statistics_label_wise_common.hpp" #include "statistics_provider_label_wise.hpp" @@ -29,40 +28,6 @@ namespace boosting { : MatrixDecorator>>(SparseSetView>(numRows, numCols)) {} }; - /** - * A histogram that stores gradients and Hessians that have been calculated using a label-wise decomposable - * loss function in the list of lists (LIL) format. 
- */ - class SparseLabelWiseHistogram final - : public ClearableViewDecorator> { - public: - - /** - * @param numBins The number of bins in the histogram - * @param numCols The number of columns in the histogram - */ - SparseLabelWiseHistogram(uint32 numBins, uint32 numCols) - : ClearableViewDecorator>( - SparseLabelWiseHistogramView(numBins, numCols)) {} - - /** - * Adds all gradients and Hessians in a vector to a specific row of this histogram. The gradients and - * Hessians to be added are multiplied by a specific weight. - * - * @param row The row - * @param begin An iterator to the beginning of the vector - * @param end An iterator to the end of the vector - * @param weight The weight, the gradients and Hessians should be multiplied by - */ - void addToRow(uint32 row, SparseSetView>::value_const_iterator begin, - SparseSetView>::value_const_iterator end, float64 weight) { - if (weight != 0) { - this->view.secondView[row] += weight; - addToSparseLabelWiseStatisticVector(this->view.firstView.values_begin(row), begin, end, weight); - } - } - }; - /** * Provides access to gradients and Hessians that have been calculated according to a differentiable loss * function that is applied label-wise and are stored using sparse data structures. 
@@ -72,9 +37,9 @@ namespace boosting { template class SparseLabelWiseStatistics final : public AbstractLabelWiseStatistics, ISparseLabelWiseLoss, - ISparseEvaluationMeasure, ISparseLabelWiseRuleEvaluationFactory> { + SparseLabelWiseStatisticMatrix, NumericSparseSetMatrix, + ISparseLabelWiseLoss, ISparseEvaluationMeasure, + ISparseLabelWiseRuleEvaluationFactory> { public: /** @@ -101,9 +66,9 @@ namespace boosting { std::unique_ptr statisticViewPtr, std::unique_ptr> scoreMatrixPtr) : AbstractLabelWiseStatistics, ISparseLabelWiseLoss, - ISparseEvaluationMeasure, ISparseLabelWiseRuleEvaluationFactory>( + SparseLabelWiseStatisticMatrix, NumericSparseSetMatrix, + ISparseLabelWiseLoss, ISparseEvaluationMeasure, + ISparseLabelWiseRuleEvaluationFactory>( std::move(lossPtr), std::move(evaluationMeasurePtr), ruleEvaluationFactory, labelMatrix, std::move(statisticViewPtr), std::move(scoreMatrixPtr)) {} diff --git a/cpp/subprojects/common/include/mlrl/common/binning/bin_index_vector.hpp b/cpp/subprojects/common/include/mlrl/common/binning/bin_index_vector.hpp deleted file mode 100644 index 688ea90382..0000000000 --- a/cpp/subprojects/common/include/mlrl/common/binning/bin_index_vector.hpp +++ /dev/null @@ -1,53 +0,0 @@ -/* - * @author Michael Rapp (michael.rapp.ml@gmail.com) - */ -#pragma once - -#include "mlrl/common/data/types.hpp" - -#include - -// Forward declarations -class IHistogram; -class IWeightedStatistics; - -/** - * Defines an interface for all classes that provide access to the indices of the bins, individual examples have been - * assigned to. - */ -class IBinIndexVector { - public: - - /** - * The index of the bin that contains sparse values. - */ - static const uint32 BIN_INDEX_SPARSE = std::numeric_limits::max(); - - virtual ~IBinIndexVector() {} - - /** - * Returns the index of the bin, the example at a specific index has been assigned to. 
- * - * @param exampleIndex The index of the example - * @return The index of the bin, the example has been assigned to - */ - virtual uint32 getBinIndex(uint32 exampleIndex) const = 0; - - /** - * Sets the index of the bin, the examples at a specific index should be assigned to. - * - * @param exampleIndex The index of the example - * @param binIndex The index of the bin, the example should be assigned to - */ - virtual void setBinIndex(uint32 exampleIndex, uint32 binIndex) = 0; - - /** - * Creates and returns a new histogram based on given statistics and the indices that are stored by this vector. - * - * @param statistics A reference to an object of type `IWeightedStatistics` that should be used - * @param numBins The number of bins in the histogram - * @return An unique pointer to an object of type `IHistogram` that has been created - */ - virtual std::unique_ptr createHistogram(const IWeightedStatistics& statistics, - uint32 numBins) const = 0; -}; diff --git a/cpp/subprojects/common/include/mlrl/common/binning/bin_index_vector_dense.hpp b/cpp/subprojects/common/include/mlrl/common/binning/bin_index_vector_dense.hpp deleted file mode 100644 index 50d7560089..0000000000 --- a/cpp/subprojects/common/include/mlrl/common/binning/bin_index_vector_dense.hpp +++ /dev/null @@ -1,27 +0,0 @@ -/* - * @author Michael Rapp (michael.rapp.ml@gmail.com) - */ -#pragma once - -#include "mlrl/common/binning/bin_index_vector.hpp" -#include "mlrl/common/data/vector_dense.hpp" - -/** - * Stores the indices of the bins, individual examples have been assigned to, using a C-contiguous array. 
- */ -class DenseBinIndexVector final : public DenseVectorDecorator>, - public IBinIndexVector { - public: - - /** - * @param numElements The number of elements in the vector - */ - DenseBinIndexVector(uint32 numElements); - - uint32 getBinIndex(uint32 exampleIndex) const override; - - void setBinIndex(uint32 exampleIndex, uint32 binIndex) override; - - std::unique_ptr createHistogram(const IWeightedStatistics& statistics, - uint32 numBins) const override; -}; diff --git a/cpp/subprojects/common/include/mlrl/common/binning/bin_index_vector_dok.hpp b/cpp/subprojects/common/include/mlrl/common/binning/bin_index_vector_dok.hpp deleted file mode 100644 index ff1c2c63c4..0000000000 --- a/cpp/subprojects/common/include/mlrl/common/binning/bin_index_vector_dok.hpp +++ /dev/null @@ -1,25 +0,0 @@ -/* - * @author Michael Rapp (michael.rapp.ml@gmail.com) - */ -#pragma once - -#include "mlrl/common/binning/bin_index_vector.hpp" -#include "mlrl/common/data/vector_dok.hpp" - -/** - * Stores the indices of the bins, individual examples have been assigned to, using the dictionaries of keys (DOK) - * format. 
- */ -class DokBinIndexVector final : public DokVectorDecorator>, - public IBinIndexVector { - public: - - DokBinIndexVector(); - - uint32 getBinIndex(uint32 exampleIndex) const override; - - void setBinIndex(uint32 exampleIndex, uint32 binIndex) override; - - std::unique_ptr createHistogram(const IWeightedStatistics& statistics, - uint32 numBins) const override; -}; diff --git a/cpp/subprojects/common/include/mlrl/common/binning/bin_weight_vector.hpp b/cpp/subprojects/common/include/mlrl/common/binning/bin_weight_vector.hpp deleted file mode 100644 index ee71403bac..0000000000 --- a/cpp/subprojects/common/include/mlrl/common/binning/bin_weight_vector.hpp +++ /dev/null @@ -1,33 +0,0 @@ -/* - * @author Michael Rapp (michael.rapp.ml@gmail.com) - */ -#pragma once - -#include "mlrl/common/data/view_vector.hpp" - -/** - * A vector that stores the weights of individual bins, i.e., how many examples have been assigned to them. - */ -class BinWeightVector final : public ClearableViewDecorator>> { - public: - - /** - * @param numElements The number of elements in the vector - */ - BinWeightVector(uint32 numElements); - - /** - * Increases the weight at a specific position by one. - * - * @param pos The position - */ - void increaseWeight(uint32 pos); - - /** - * Returns whether the weight at a specific position is non-zero or not. 
- * - * @param pos The position - * @return True, if the weight is non-zero, false otherwise - */ - bool operator[](uint32 pos) const; -}; diff --git a/cpp/subprojects/common/include/mlrl/common/binning/feature_binning.hpp b/cpp/subprojects/common/include/mlrl/common/binning/feature_binning.hpp deleted file mode 100644 index 060bf2ffcc..0000000000 --- a/cpp/subprojects/common/include/mlrl/common/binning/feature_binning.hpp +++ /dev/null @@ -1,92 +0,0 @@ -/* - * @author Lukas Johannes Eberle (lukasjohannes.eberle@stud.tu-darmstadt.de) - * @author Michael Rapp (michael.rapp.ml@gmail.com) - */ -#pragma once - -#include "mlrl/common/binning/bin_index_vector.hpp" -#include "mlrl/common/binning/threshold_vector.hpp" -#include "mlrl/common/input/feature_matrix.hpp" -#include "mlrl/common/input/feature_vector.hpp" -#include "mlrl/common/input/label_matrix.hpp" -#include "mlrl/common/thresholds/thresholds.hpp" - -#include - -/** - * Defines an interface for methods that assign feature values to bins. - */ -class IFeatureBinning { - public: - - /** - * The result that is returned by a binning method. It contains an unique pointer to a vector that stores the - * thresholds that result from the boundaries of the bins, as well as to a vector that stores the indices of the - * bins, individual values have been assigned to. - */ - struct Result final { - public: - - /** - * An unique pointer to an object of type `ThresholdVector` that provides access to the thresholds that - * result from the boundaries of the bins. - */ - std::unique_ptr thresholdVectorPtr; - - /** - * An unique pointer to an object of type `IBinIndexVector` that provides access to the indices of the - * bins, individual values have been assigned to. - */ - std::unique_ptr binIndicesPtr; - }; - - virtual ~IFeatureBinning() {} - - /** - * Assigns the values in a given `FeatureVector` to bins. 
- * - * @param featureVector A reference to an object of type `FeatureVector` whose values should be assigned to bins - * @param numExamples The total number of available training examples - * @return An object of type `Result` that contains a vector, which stores thresholds that result - * from the boundaries between the bins, as well as a vector that stores the indices of the - * bins, individual values have been assigned to - */ - virtual Result createBins(FeatureVector& featureVector, uint32 numExamples) const = 0; -}; - -/** - * Defines an interface for all factories that allow to create instances of the type `IFeatureBinning`. - */ -class IFeatureBinningFactory { - public: - - virtual ~IFeatureBinningFactory() {} - - /** - * Creates and returns a new object of type `IFeatureBinning`. - * - * @return An unique pointer to an object of type `IFeatureBinning` that has been created - */ - virtual std::unique_ptr create() const = 0; -}; - -/** - * Defines an interface for all classes that allow to configure a method that assigns feature values to bins. - */ -class IFeatureBinningConfig { - public: - - virtual ~IFeatureBinningConfig() {} - - /** - * Creates and returns a new object of type `IThresholdsFactory` according to the specified configuration. 
- * - * @param featureMatrix A reference to an object of type `IFeatureMatrix` that provides access to the feature - * values of the training examples - * @param labelMatrix A reference to an object of type `ILabelMatrix` that provides access to the labels of - * the training examples - * @return An unique pointer to an object of type `IThresholdsFactory` that has been created - */ - virtual std::unique_ptr createThresholdsFactory(const IFeatureMatrix& featureMatrix, - const ILabelMatrix& labelMatrix) const = 0; -}; diff --git a/cpp/subprojects/common/include/mlrl/common/binning/feature_binning_no.hpp b/cpp/subprojects/common/include/mlrl/common/binning/feature_binning_no.hpp deleted file mode 100644 index 3014ffede5..0000000000 --- a/cpp/subprojects/common/include/mlrl/common/binning/feature_binning_no.hpp +++ /dev/null @@ -1,28 +0,0 @@ -/* - * @author Michael Rapp (michael.rapp.ml@gmail.com) - */ -#pragma once - -#include "mlrl/common/binning/feature_binning.hpp" -#include "mlrl/common/multi_threading/multi_threading.hpp" - -/** - * Allows to configure a method that does not actually perform any feature binning. 
- */ -class NoFeatureBinningConfig final : public IFeatureBinningConfig { - private: - - const std::unique_ptr& multiThreadingConfigPtr_; - - public: - - /** - * @param multiThreadingConfigPtr A reference to an unique pointer that stores the configuration of the - * multi-threading behavior that should be used for the parallel update of - * statistics - */ - NoFeatureBinningConfig(const std::unique_ptr& multiThreadingConfigPtr); - - std::unique_ptr createThresholdsFactory(const IFeatureMatrix& featureMatrix, - const ILabelMatrix& labelMatrix) const override; -}; diff --git a/cpp/subprojects/common/include/mlrl/common/binning/threshold_vector.hpp b/cpp/subprojects/common/include/mlrl/common/binning/threshold_vector.hpp deleted file mode 100644 index 8d2f929569..0000000000 --- a/cpp/subprojects/common/include/mlrl/common/binning/threshold_vector.hpp +++ /dev/null @@ -1,44 +0,0 @@ -/* - * @author Michael Rapp (michael.rapp.ml@gmail.com) - */ -#pragma once - -#include "mlrl/common/data/vector_dense.hpp" -#include "mlrl/common/input/missing_feature_vector.hpp" - -/** - * An one-dimensional vector that stores thresholds that may be used by conditions. - */ -class ThresholdVector final : public ResizableVectorDecorator>>, - public OldMissingFeatureVector { - private: - - uint32 sparseBinIndex_; - - public: - - /** - * @param missingFeatureVector A reference to an object of type `OldMissingFeatureVector` the missing indices - * should be taken from - * @param numElements The number of elements in the vector - * @param init True, if all elements in the vector should be value-initialized, false otherwise - */ - ThresholdVector(OldMissingFeatureVector& missingFeatureVector, uint32 numElements, bool init = false); - - /** - * Returns the index of the bin, sparse values have been assigned to. - * - * @return The index of the bin, sparse values have been assigned to. 
If there is no such bin, the returned - * index is equal to `getNumElements()` - */ - uint32 getSparseBinIndex() const; - - /** - * Sets the index of the bin, sparse values have been assigned to. - * - * @param sparseBinIndex The index to be set - */ - void setSparseBinIndex(uint32 sparseBinIndex); - - void setNumElements(uint32 numElements, bool freeMemory) override; -}; diff --git a/cpp/subprojects/common/include/mlrl/common/data/vector_dok.hpp b/cpp/subprojects/common/include/mlrl/common/data/vector_dok.hpp deleted file mode 100644 index b5831c3086..0000000000 --- a/cpp/subprojects/common/include/mlrl/common/data/vector_dok.hpp +++ /dev/null @@ -1,15 +0,0 @@ -/* - * @author Michael Rapp (michael.rapp.ml@gmail.com) - */ -#pragma once - -#include "mlrl/common/data/view_vector_dok.hpp" - -/** - * Provides read and write access via iterators to all non-zero values stored in vector in the dictionary of keys (DOK) - * format. - * - * @tparam Vector The type of the vector - */ -template -using DokVectorDecorator = IterableDokVectorDecorator>; diff --git a/cpp/subprojects/common/include/mlrl/common/data/view_vector_dok.hpp b/cpp/subprojects/common/include/mlrl/common/data/view_vector_dok.hpp deleted file mode 100644 index 4834395cb0..0000000000 --- a/cpp/subprojects/common/include/mlrl/common/data/view_vector_dok.hpp +++ /dev/null @@ -1,245 +0,0 @@ -/* - * @author Michael Rapp (michael.rapp.ml@gmail.com) - */ -#pragma once - -#include "mlrl/common/data/view.hpp" - -#include - -/** - * A view that provides access to values stored in a sparse vector in the dictionary of keys (DOK) format. - * - * @tparam T The type of the valeus stored in the vector - */ -template -class MLRLCOMMON_API DokVector { - protected: - - /** - * A pointer to an object of type `std::unordered_map` that stores the indices and values of all non-zero - * elements in the view. - */ - std::unordered_map* values_; - - public: - - /** - * The value of sparse elements. 
- */ - const T sparseValue; - - /** - * @param values A pointer to an object of type `std::unordered_map` that stores the indices and values - * of all non-zero elements in the view - * @param sparseValue The value of sparse elements - */ - DokVector(std::unordered_map* values, T sparseValue) : values_(values), sparseValue(sparseValue) {} - - /** - * @param other A reference to an object of type `DokVector` that should be copied - */ - DokVector(const DokVector& other) : values_(other.values_), sparseValue(other.sparseValue) {} - - /** - * @param other A reference to an object of type `DokVector` that should be moved - */ - DokVector(DokVector&& other) : values_(other.values_), sparseValue(other.sparseValue) {} - - virtual ~DokVector() {} - - /** - * The type of the indices, the view provides access to. - */ - typedef uint32 index_type; - - /** - * The type of the values, the view provides access to. - */ - typedef T value_type; - - /** - * An iterator that provides read-only access to non-zero elements in the vector. - */ - typedef typename std::unordered_map::const_iterator const_iterator; - - /** - * An iterator that provides access to non-zero elements in the vector and allows to modify them. - */ - typedef typename std::unordered_map::iterator iterator; - - /** - * Returns a `const_iterator` to the beginning of the vector. - * - * @return A `const_iterator` to the beginning - */ - const_iterator cbegin() const { - return values_->cbegin(); - } - - /** - * Returns a `const_iterator` to the end of the vector. - * - * @return A `const_iterator` to the end - */ - const_iterator cend() const { - return values_->cend(); - } - - /** - * Returns an `iterator` to the beginning of the vector. - * - * @return An `iterator` to the beginning - */ - iterator begin() { - return values_->begin(); - } - - /** - * Returns an `iterator` to the end of the vector. 
- * - * @return An `iterator` to the end - */ - iterator end() { - return values_->end(); - } - - /** - * Returns the value of the element at a specific index. - * - * @param index The index of the element - * @return The value of the element at the given index - */ - const value_type& operator[](index_type index) const { - auto it = values_->find(index); - return it != values_->cend() ? it->second : sparseValue; - } - - /** - * Sets the value of the element at a specific position. - * - * @param index The index of the element - * @param value The value to be set - */ - void set(index_type index, value_type value) { - auto result = values_->emplace(index, value); - - if (!result.second) { - result.first->second = value; - } - } - - /** - * Sets all values stored in the view to zero. - */ - void clear() { - values_->clear(); - } -}; - -/** - * Allocates the memory for a view that provides access to values stored in a sparse vector in the dictionary of keys - * (DOK) format. - * - * @tparam Vector The type of the view - */ -template -class MLRLCOMMON_API DokVectorAllocator : public Vector { - public: - - /** - * @param sparseValue The value of sparse elements - */ - DokVectorAllocator(typename Vector::value_type sparseValue = 0) - : Vector(new std::unordered_map(), sparseValue) {} - - /** - * @param other A reference to an object of type `DokVectorAllocator` that should be copied - */ - DokVectorAllocator(const DokVectorAllocator& other) : Vector(other) { - throw std::runtime_error("Objects of type DokVectorAllocator cannot be copied"); - } - - /** - * @param other A reference to an object of type `DokVectorAllocator` that should be moved - */ - DokVectorAllocator(DokVectorAllocator&& other) : Vector(std::move(other)) { - other.values_ = nullptr; - } - - virtual ~DokVectorAllocator() override { - delete Vector::values_; - } -}; - -/** - * Allocates the memory, a `DokVector` provides access to. 
- * - * @tparam T The type of the values stored in the `DokVector` - */ -template -using AllocatedDokVector = DokVectorAllocator>; - -/** - * Provides read and write access via iterators to all non-zero values stored in a sparse vector in the dictionary of - * keys (DOK) format. - * - * @tparam Vector The type of the vector - */ -template -class MLRLCOMMON_API IterableDokVectorDecorator : public Vector { - public: - - /** - * An iterator that provides read-only access to non-zero elements in the vector. - */ - typedef typename Vector::view_type::const_iterator const_iterator; - - /** - * An iterator that provides access to non-zero elements in the vector and allows to modify them. - */ - typedef typename Vector::view_type::iterator iterator; - - /** - * @param view The view, the vector should be backed by - */ - IterableDokVectorDecorator(typename Vector::view_type&& view) : Vector(std::move(view)) {} - - virtual ~IterableDokVectorDecorator() override {} - - /** - * Returns a `const_iterator` to the beginning of the vector. - * - * @return A `const_iterator` to the beginning - */ - const_iterator cbegin() const { - return Vector::view.cbegin(); - } - - /** - * Returns a `const_iterator` to the end of the vector. - * - * @return A `const_iterator` to the end - */ - const_iterator cend() const { - return Vector::view.cend(); - } - - /** - * Returns an `iterator` to the beginning of the vector. - * - * @return An `iterator` to the beginning - */ - iterator begin() { - return Vector::view.begin(); - } - - /** - * Returns an `iterator` to the end of the vector. 
- * - * @return An `iterator` to the end - */ - iterator end() { - return Vector::view.end(); - } -}; diff --git a/cpp/subprojects/common/include/mlrl/common/indices/index_vector.hpp b/cpp/subprojects/common/include/mlrl/common/indices/index_vector.hpp index 1d7732754a..8ebcd6d621 100644 --- a/cpp/subprojects/common/include/mlrl/common/indices/index_vector.hpp +++ b/cpp/subprojects/common/include/mlrl/common/indices/index_vector.hpp @@ -9,7 +9,7 @@ // Forward declarations class IRuleRefinement; -class IThresholdsSubset; +class IFeatureSubspace; /** * Defines an interface for all classes that provide random access to indices. @@ -46,11 +46,11 @@ class IIndexVector { * Creates and return a new instance of type `IRuleRefinement` that allows to search for the best refinement of * an existing rule that predicts only for the labels whose indices are stored in this vector. * - * @param thresholdsSubset A reference to an object of type `IThresholdsSubset` that should be used to create - * the instance + * @param featureSubspace A reference to an object of type `IFeatureSubspace` that should be used to search for + * the refinement * @param featureIndex The index of the feature that should be considered when searching for the refinement * @return An unique pointer to an object of type `IRuleRefinement` that has been created */ - virtual std::unique_ptr createRuleRefinement(IThresholdsSubset& thresholdsSubset, + virtual std::unique_ptr createRuleRefinement(IFeatureSubspace& featureSubspace, uint32 featureIndex) const = 0; }; diff --git a/cpp/subprojects/common/include/mlrl/common/indices/index_vector_complete.hpp b/cpp/subprojects/common/include/mlrl/common/indices/index_vector_complete.hpp index 270e514fc3..ee43590860 100644 --- a/cpp/subprojects/common/include/mlrl/common/indices/index_vector_complete.hpp +++ b/cpp/subprojects/common/include/mlrl/common/indices/index_vector_complete.hpp @@ -54,6 +54,6 @@ class CompleteIndexVector final : public IIndexVector { uint32
getIndex(uint32 pos) const override; - std::unique_ptr createRuleRefinement(IThresholdsSubset& thresholdsSubset, + std::unique_ptr createRuleRefinement(IFeatureSubspace& featureSubspace, uint32 featureIndex) const override; }; diff --git a/cpp/subprojects/common/include/mlrl/common/indices/index_vector_partial.hpp b/cpp/subprojects/common/include/mlrl/common/indices/index_vector_partial.hpp index c50b07b597..3433da667c 100644 --- a/cpp/subprojects/common/include/mlrl/common/indices/index_vector_partial.hpp +++ b/cpp/subprojects/common/include/mlrl/common/indices/index_vector_partial.hpp @@ -25,6 +25,6 @@ class PartialIndexVector final : public ResizableVectorDecorator createRuleRefinement(IThresholdsSubset& thresholdsSubset, + std::unique_ptr createRuleRefinement(IFeatureSubspace& featureSubspace, uint32 featureIndex) const override; }; diff --git a/cpp/subprojects/common/include/mlrl/common/input/feature_binning.hpp b/cpp/subprojects/common/include/mlrl/common/input/feature_binning.hpp new file mode 100644 index 0000000000..b0c0d92557 --- /dev/null +++ b/cpp/subprojects/common/include/mlrl/common/input/feature_binning.hpp @@ -0,0 +1,56 @@ +/* + * @author Lukas Johannes Eberle (lukasjohannes.eberle@stud.tu-darmstadt.de) + * @author Michael Rapp (michael.rapp.ml@gmail.com) + */ +#pragma once + +#include "mlrl/common/input/feature_matrix.hpp" +#include "mlrl/common/input/feature_type.hpp" +#include "mlrl/common/input/label_matrix.hpp" + +/** + * Defines an interface for methods that assign feature values to bins. + */ +class IFeatureBinning : public IFeatureType { + public: + + virtual ~IFeatureBinning() override {} +}; + +/** + * Defines an interface for all factories that allow to create instances of the type `IFeatureBinning`. + */ +class IFeatureBinningFactory { + public: + + virtual ~IFeatureBinningFactory() {} + + /** + * Creates and returns a new object of type `IFeatureBinning`. 
+ * + * @return An unique pointer to an object of type `IFeatureBinning` that has been created or a null pointer, if + * no feature binning should be used + */ + virtual std::unique_ptr create() const = 0; +}; + +/** + * Defines an interface for all classes that allow to configure a method that assigns feature values to bins. + */ +class IFeatureBinningConfig { + public: + + virtual ~IFeatureBinningConfig() {} + + /** + * Creates and returns a new object of type `IFeatureBinningFactory` according to the specified configuration. + * + * @param featureMatrix A reference to an object of type `IFeatureMatrix` that provides access to the feature + * values of the training examples + * @param labelMatrix A reference to an object of type `ILabelMatrix` that provides access to the labels of + * the training examples + * @return An unique pointer to an object of type `IFeatureBinningFactory` that has been created + */ + virtual std::unique_ptr createFeatureBinningFactory( + const IFeatureMatrix& featureMatrix, const ILabelMatrix& labelMatrix) const = 0; +}; diff --git a/cpp/subprojects/common/include/mlrl/common/binning/feature_binning_equal_frequency.hpp b/cpp/subprojects/common/include/mlrl/common/input/feature_binning_equal_frequency.hpp similarity index 81% rename from cpp/subprojects/common/include/mlrl/common/binning/feature_binning_equal_frequency.hpp rename to cpp/subprojects/common/include/mlrl/common/input/feature_binning_equal_frequency.hpp index 27d5f30c59..33ee09eb9f 100644 --- a/cpp/subprojects/common/include/mlrl/common/binning/feature_binning_equal_frequency.hpp +++ b/cpp/subprojects/common/include/mlrl/common/input/feature_binning_equal_frequency.hpp @@ -4,8 +4,7 @@ */ #pragma once -#include "mlrl/common/binning/feature_binning.hpp" -#include "mlrl/common/multi_threading/multi_threading.hpp" +#include "mlrl/common/input/feature_binning.hpp" /** * Defines an interface for all classes that allow to configure a method that assigns numerical feature values to 
bins, @@ -82,16 +81,9 @@ class EqualFrequencyFeatureBinningConfig final : public IFeatureBinningConfig, uint32 maxBins_; - const std::unique_ptr& multiThreadingConfigPtr_; - public: - /** - * @param multiThreadingConfigPtr A reference to an unique pointer that stores the configuration of the - * multi-threading behavior that should be used for the parallel update of - * statistics - */ - EqualFrequencyFeatureBinningConfig(const std::unique_ptr& multiThreadingConfigPtr); + EqualFrequencyFeatureBinningConfig(); float32 getBinRatio() const override; @@ -105,6 +97,6 @@ class EqualFrequencyFeatureBinningConfig final : public IFeatureBinningConfig, IEqualFrequencyFeatureBinningConfig& setMaxBins(uint32 maxBins) override; - std::unique_ptr createThresholdsFactory(const IFeatureMatrix& featureMatrix, - const ILabelMatrix& labelMatrix) const override; + std::unique_ptr createFeatureBinningFactory( + const IFeatureMatrix& featureMatrix, const ILabelMatrix& labelMatrix) const override; }; diff --git a/cpp/subprojects/common/include/mlrl/common/binning/feature_binning_equal_width.hpp b/cpp/subprojects/common/include/mlrl/common/input/feature_binning_equal_width.hpp similarity index 81% rename from cpp/subprojects/common/include/mlrl/common/binning/feature_binning_equal_width.hpp rename to cpp/subprojects/common/include/mlrl/common/input/feature_binning_equal_width.hpp index 0b2e353a65..8179018082 100644 --- a/cpp/subprojects/common/include/mlrl/common/binning/feature_binning_equal_width.hpp +++ b/cpp/subprojects/common/include/mlrl/common/input/feature_binning_equal_width.hpp @@ -4,8 +4,7 @@ */ #pragma once -#include "mlrl/common/binning/feature_binning.hpp" -#include "mlrl/common/multi_threading/multi_threading.hpp" +#include "mlrl/common/input/feature_binning.hpp" /** * Defines an interface for all classes that allow to configure a method that assigns numerical feature values to bins, @@ -82,16 +81,9 @@ class EqualWidthFeatureBinningConfig final : public 
IFeatureBinningConfig, uint32 maxBins_; - const std::unique_ptr& multiThreadingConfigPtr_; - public: - /** - * @param multiThreadingConfigPtr A reference to an unique pointer that stores the configuration of the - * multi-threading behavior that should be used for the parallel update of - * statistics - */ - EqualWidthFeatureBinningConfig(const std::unique_ptr& multiThreadingConfigPtr); + EqualWidthFeatureBinningConfig(); float32 getBinRatio() const override; @@ -105,6 +97,6 @@ class EqualWidthFeatureBinningConfig final : public IFeatureBinningConfig, IEqualWidthFeatureBinningConfig& setMaxBins(uint32 maxBins) override; - std::unique_ptr createThresholdsFactory(const IFeatureMatrix& featureMatrix, - const ILabelMatrix& labelMatrix) const override; + std::unique_ptr createFeatureBinningFactory( + const IFeatureMatrix& featureMatrix, const ILabelMatrix& labelMatrix) const override; }; diff --git a/cpp/subprojects/common/include/mlrl/common/input/feature_binning_no.hpp b/cpp/subprojects/common/include/mlrl/common/input/feature_binning_no.hpp new file mode 100644 index 0000000000..69ddbc9e58 --- /dev/null +++ b/cpp/subprojects/common/include/mlrl/common/input/feature_binning_no.hpp @@ -0,0 +1,16 @@ +/* + * @author Michael Rapp (michael.rapp.ml@gmail.com) + */ +#pragma once + +#include "mlrl/common/input/feature_binning.hpp" + +/** + * Allows to configure a method that does not actually perform any feature binning. 
+ */ +class NoFeatureBinningConfig final : public IFeatureBinningConfig { + public: + + std::unique_ptr createFeatureBinningFactory( + const IFeatureMatrix& featureMatrix, const ILabelMatrix& labelMatrix) const override; +}; diff --git a/cpp/subprojects/common/include/mlrl/common/input/feature_info.hpp b/cpp/subprojects/common/include/mlrl/common/input/feature_info.hpp index c2f20a5fa3..2a84b3bcd7 100644 --- a/cpp/subprojects/common/include/mlrl/common/input/feature_info.hpp +++ b/cpp/subprojects/common/include/mlrl/common/input/feature_info.hpp @@ -4,6 +4,7 @@ #pragma once #include "mlrl/common/data/types.hpp" +#include "mlrl/common/input/feature_binning.hpp" #include "mlrl/common/input/feature_type.hpp" #include "mlrl/common/util/dll_exports.hpp" @@ -21,7 +22,12 @@ class MLRLCOMMON_API IFeatureInfo { * Creates and returns a new object of type `IFeatureType` that corresponds to the type of the feature at a * specific index. * - * @return An unique pointer to an object of the type `IFeatureType` that has been created + * @param featureIndex The index of the feature + * @param featureBinningFactory A reference to an object of type `IFeatureBinningFactory` that allows to create + * implementations of the binning method to be used for assigning numerical feature + * values to bins + * @return An unique pointer to an object of the type `IFeatureType` that has been created */ - virtual std::unique_ptr createFeatureType(uint32 featureIndex) const = 0; + virtual std::unique_ptr createFeatureType( + uint32 featureIndex, const IFeatureBinningFactory& featureBinningFactory) const = 0; }; diff --git a/cpp/subprojects/common/include/mlrl/common/input/feature_matrix_column_wise.hpp b/cpp/subprojects/common/include/mlrl/common/input/feature_matrix_column_wise.hpp index d65fb37c8e..ffddead4ee 100644 --- a/cpp/subprojects/common/include/mlrl/common/input/feature_matrix_column_wise.hpp +++ b/cpp/subprojects/common/include/mlrl/common/input/feature_matrix_column_wise.hpp @@ -17,18 
+17,6 @@ class MLRLCOMMON_API IColumnWiseFeatureMatrix : public IFeatureMatrix { virtual ~IColumnWiseFeatureMatrix() override {} - /** - * Fetches a feature vector that stores the indices of the training examples, as well as their feature values, - * for a specific feature and stores it in a given unique pointer. - * - * @param featureIndex The index of the feature - * @param featureVectorPtr An unique pointer to an object of type `FeatureVector` that should be used to store - * the feature vector - */ - // TODO Remove - virtual void fetchFeatureVector(uint32 featureIndex, - std::unique_ptr& featureVectorPtr) const = 0; - /** * Creates and returns a feature vector that stores the feature values of the available examples for a certain * feature. diff --git a/cpp/subprojects/common/include/mlrl/common/input/feature_type.hpp b/cpp/subprojects/common/include/mlrl/common/input/feature_type.hpp index a34547034b..4c31aac94f 100644 --- a/cpp/subprojects/common/include/mlrl/common/input/feature_type.hpp +++ b/cpp/subprojects/common/include/mlrl/common/input/feature_type.hpp @@ -15,22 +15,6 @@ class IFeatureType { virtual ~IFeatureType() {} - /** - * Returns whether the feature is ordinal or not. - * - * @return True, if the feature is ordinal, false otherwise - */ - // TODO Remove - virtual bool isOrdinal() const = 0; - - /** - * Returns whether the feature is nominal or not. - * - * @return True, if the feature is nominal, false otherwise - */ - // TODO Remove - virtual bool isNominal() const = 0; - /** * Creates and returns a feature vector that stores the feature values taken from a given Fortran-contiguous * matrix for a certain feature. 
diff --git a/cpp/subprojects/common/include/mlrl/common/input/feature_type_nominal.hpp b/cpp/subprojects/common/include/mlrl/common/input/feature_type_nominal.hpp index 378482f46d..a1e50c7076 100644 --- a/cpp/subprojects/common/include/mlrl/common/input/feature_type_nominal.hpp +++ b/cpp/subprojects/common/include/mlrl/common/input/feature_type_nominal.hpp @@ -11,10 +11,6 @@ class NominalFeatureType final : public IFeatureType { public: - bool isOrdinal() const override; - - bool isNominal() const override; - std::unique_ptr createFeatureVector( uint32 featureIndex, const FortranContiguousView& featureMatrix) const override; diff --git a/cpp/subprojects/common/include/mlrl/common/input/feature_type_numerical.hpp b/cpp/subprojects/common/include/mlrl/common/input/feature_type_numerical.hpp index 49d314f646..d12e523543 100644 --- a/cpp/subprojects/common/include/mlrl/common/input/feature_type_numerical.hpp +++ b/cpp/subprojects/common/include/mlrl/common/input/feature_type_numerical.hpp @@ -11,10 +11,6 @@ class NumericalFeatureType final : public IFeatureType { public: - bool isOrdinal() const override; - - bool isNominal() const override; - std::unique_ptr createFeatureVector( uint32 featureIndex, const FortranContiguousView& featureMatrix) const override; diff --git a/cpp/subprojects/common/include/mlrl/common/input/feature_type_ordinal.hpp b/cpp/subprojects/common/include/mlrl/common/input/feature_type_ordinal.hpp index b94fae8ad9..8ed7dbfb1e 100644 --- a/cpp/subprojects/common/include/mlrl/common/input/feature_type_ordinal.hpp +++ b/cpp/subprojects/common/include/mlrl/common/input/feature_type_ordinal.hpp @@ -11,10 +11,6 @@ class OrdinalFeatureType final : public IFeatureType { public: - bool isOrdinal() const override; - - bool isNominal() const override; - std::unique_ptr createFeatureVector( uint32 featureIndex, const FortranContiguousView& featureMatrix) const override; diff --git a/cpp/subprojects/common/include/mlrl/common/input/feature_vector.hpp 
b/cpp/subprojects/common/include/mlrl/common/input/feature_vector.hpp index eb600f779f..0d3979d7f8 100644 --- a/cpp/subprojects/common/include/mlrl/common/input/feature_vector.hpp +++ b/cpp/subprojects/common/include/mlrl/common/input/feature_vector.hpp @@ -3,13 +3,10 @@ */ #pragma once -#include "mlrl/common/data/indexed_value.hpp" -#include "mlrl/common/data/vector_dense.hpp" #include "mlrl/common/input/interval.hpp" -#include "mlrl/common/input/missing_feature_vector.hpp" -#include "mlrl/common/rule_refinement/rule_refinement_search.hpp" +#include "mlrl/common/rule_refinement/coverage_mask.hpp" +#include "mlrl/common/rule_refinement/feature_based_search.hpp" #include "mlrl/common/statistics/statistics_weighted.hpp" -#include "mlrl/common/thresholds/coverage_mask.hpp" /** * Defines an interface for all one-dimensional vectors that store the values of training examples for a certain @@ -23,7 +20,7 @@ class IFeatureVector { /** * Conducts a search for the best refinement of an existing rule that can be created from a this feature vector. 
* - * @param ruleRefinementSearch A reference to an object of type `RuleRefinementSearch` that should be + * @param featureBasedSearch A reference to an object of type `FeatureBasedSearch` that should be * used for conducting the search * @param statisticsSubset A reference to an object of type `IWeightedStatisticsSubset` that * provides access to weighted statistics about the labels of the training @@ -37,7 +34,7 @@ class IFeatureVector { * @param refinement A reference to an object of type `Refinement` that should be used for * storing the properties of the best refinement that is found */ - virtual void searchForRefinement(RuleRefinementSearch& ruleRefinementSearch, + virtual void searchForRefinement(FeatureBasedSearch& featureBasedSearch, IWeightedStatisticsSubset& statisticsSubset, SingleRefinementComparator& comparator, uint32 numExamplesWithNonZeroWeights, uint32 minCoverage, Refinement& refinement) const = 0; @@ -45,7 +42,7 @@ class IFeatureVector { /** * Conducts a search for the best refinement of an existing rule that can be created from a this feature vector. 
* - * @param ruleRefinementSearch A reference to an object of type `RuleRefinementSearch` that should be + * @param featureBasedSearch A reference to an object of type `FeatureBasedSearch` that should be * used for conducting the search * @param statisticsSubset A reference to an object of type `IWeightedStatisticsSubset` that * provides access to weighted statistics about the labels of the training @@ -59,7 +56,7 @@ class IFeatureVector { * @param refinement A reference to an object of type `Refinement` that should be used for * storing the properties of the best refinement that is found */ - virtual void searchForRefinement(RuleRefinementSearch& ruleRefinementSearch, + virtual void searchForRefinement(FeatureBasedSearch& featureBasedSearch, IWeightedStatisticsSubset& statisticsSubset, FixedRefinementComparator& comparator, uint32 numExamplesWithNonZeroWeights, uint32 minCoverage, Refinement& refinement) const = 0; @@ -103,24 +100,3 @@ class IFeatureVector { virtual std::unique_ptr createFilteredFeatureVector(std::unique_ptr& existing, const CoverageMask& coverageMask) const = 0; }; - -/** - * An one-dimensional sparse vector that stores the values of training examples for a certain feature, as well as the - * indices of examples with missing feature values. - */ -// TODO Remove -class FeatureVector final - : public ResizableVectorDecorator>>>, - public OldMissingFeatureVector { - public: - - /** - * @param numElements The number of elements in the vector - */ - FeatureVector(uint32 numElements); - - /** - * Sorts the elements in the vector in ascending order based on their values. 
- */ - void sortByValues(); -}; diff --git a/cpp/subprojects/common/include/mlrl/common/input/feature_vector_binned.hpp b/cpp/subprojects/common/include/mlrl/common/input/feature_vector_binned.hpp new file mode 100644 index 0000000000..39e5c769fa --- /dev/null +++ b/cpp/subprojects/common/include/mlrl/common/input/feature_vector_binned.hpp @@ -0,0 +1,193 @@ +/* + * @author Michael Rapp (michael.rapp.ml@gmail.com) + */ +#pragma once + +#include "mlrl/common/data/view.hpp" + +/** + * A feature vector that stores the indices of the examples that are associated with each bin, except for the most + * frequent one, created by a method that assigns numerical feature values to bins. + */ +class MLRLCOMMON_API BinnedFeatureVector { + public: + + /** + * A pointer to an array that stores thresholds separating adjacent bins. + */ + float32* thresholds; + + /** + * A pointer to an array that stores the indices of all examples not associated with the most frequent bin. + */ + uint32* indices; + + /** + * A pointer to an array that stores the indices of the first element in `indices` that corresponds to a certain + * bin. + */ + uint32* indptr; + + /** + * The number of bins, excluding the most frequent one. + */ + uint32 numBins; + + /** + * The index of the most frequent bin. 
+ */ + uint32 sparseBinIndex; + + public: + + /** + * @param thresholds A pointer to an array of type `float32`, shape `(numBins - 1)` that stores + * thresholds separating bins + * @param indices A pointer to an array of type `uint32`, shape `(numIndices)` that stores the indices + * of all examples not associated with the most frequent bin + * @param indptr A pointer to an array that stores the indices of the first element in `indices` that + * corresponds to a certain bin + * @param numBins The number of bins, including the most frequent one + * @param numIndices The number of elements in the array `indices` + * @param sparseBinIndex The index of the most frequent bin + */ + BinnedFeatureVector(float32* thresholds, uint32* indices, uint32* indptr, uint32 numBins, uint32 numIndices, + uint32 sparseBinIndex); + + /** + * @param other A reference to an object of type `BinnedFeatureVector` that should be copied + */ + BinnedFeatureVector(const BinnedFeatureVector& other); + + /** + * @param other A reference to an object of type `BinnedFeatureVector` that should be moved + */ + BinnedFeatureVector(BinnedFeatureVector&& other); + + virtual ~BinnedFeatureVector() {}; + + /** + * The type of the indices, the view provides access to. + */ + typedef uint32 index_type; + + /** + * The type of the thresholds, the view provides access to. + */ + typedef float32 threshold_type; + + /** + * An iterator that provides read-only access to all thresholds. + */ + typedef const float32* threshold_const_iterator; + + /** + * An iterator that provides access to all thresholds and allows to modify them. + */ + typedef float32* threshold_iterator; + + /** + * An iterator that provides read-only access to the indices of the examples that are associated with each bin, + * except for the most frequent bin. 
+ */ + typedef const uint32* index_const_iterator; + + /** + * An iterator that provides access to the indices of the examples that are associated with each bin, except for + * the most frequent bin, and allows to modify them. + */ + typedef uint32* index_iterator; + + /** + * Returns a `threshold_const_iterator` to the beginning of the thresholds. + * + * @return A `threshold_const_iterator` to the beginning + */ + threshold_const_iterator thresholds_cbegin() const; + + /** + * Returns a `threshold_const_iterator` to the end of the thresholds. + * + * @return A `threshold_const_iterator` to the end + */ + threshold_const_iterator thresholds_cend() const; + + /** + * Returns a `threshold_iterator` to the beginning of the thresholds. + * + * @return A `threshold_iterator` to the beginning + */ + threshold_iterator thresholds_begin(); + + /** + * Returns a `threshold_iterator` to the end of the thresholds. + * + * @return A `threshold_iterator` to the end + */ + threshold_iterator thresholds_end(); + + /** + * Returns an `index_const_iterator` to the beginning of the indices of the examples that are associated with a + * specific bin. + * + * @param index The index of the bin + * @return An `index_const_iterator` to the beginning + */ + index_const_iterator indices_cbegin(uint32 index) const; + + /** + * Returns an `index_const_iterator` to the end of the indices of the examples that are associated with a + * specific bin. + * + * @param index The index of the bin + * @return An `index_const_iterator` to the end + */ + index_const_iterator indices_cend(uint32 index) const; + + /** + * Returns an `index_iterator` to the beginning of the indices of the examples that are associated with a + * specific bin. + * + * @param index The index of the bin + * @return An `index_iterator` to the beginning + */ + index_iterator indices_begin(uint32 index); + + /** + * Returns an `index_iterator` to the end of the indices of the examples that are associated with a specific + * bin.
+ * + * @param index The index of the bin + * @return An `index_iterator` to the end + */ + index_iterator indices_end(uint32 index); + + /** + * Releases the ownership of the array that stores the thresholds. As a result, the behavior of this view + * becomes undefined and it should not be used anymore. The caller is responsible for freeing the memory that is + * occupied by the array. + * + * @return A pointer to an array that stores all thresholds + */ + threshold_type* releaseThresholds(); + + /** + * Releases the ownership of the array that stores the indices of all examples not associated with the most + * frequent bin. As a result, the behavior of this view becomes undefined and it should not be used anymore. The + * caller is responsible for freeing the memory that is occupied by the array. + * + * @return A pointer to the array that stores the indices of all examples not associated with the most frequent + * bin + */ + index_type* releaseIndices(); + + /** + * Releases the ownership of the array that stores the indices of the first element in `indices` that + * corresponds to a certain bin. As a result, the behavior of this view becomes undefined and it should not be + * used anymore. The caller is responsible for freeing the memory that is occupied by the array. 
+ * + * @return A pointer to an array that stores the indices of the first element in `indices` that corresponds to + * a certain bin + */ + index_type* releaseIndptr(); +}; diff --git a/cpp/subprojects/common/include/mlrl/common/input/feature_vector_equal.hpp b/cpp/subprojects/common/include/mlrl/common/input/feature_vector_equal.hpp index 996747d1e7..0aa8681626 100644 --- a/cpp/subprojects/common/include/mlrl/common/input/feature_vector_equal.hpp +++ b/cpp/subprojects/common/include/mlrl/common/input/feature_vector_equal.hpp @@ -12,15 +12,13 @@ class EqualFeatureVector final : public IFeatureVector { public: - void searchForRefinement(RuleRefinementSearch& ruleRefinementSearch, - IWeightedStatisticsSubset& statisticsSubset, SingleRefinementComparator& comparator, - uint32 numExamlesWithNonZeroWeights, uint32 minCoverage, - Refinement& refinement) const override; + void searchForRefinement(FeatureBasedSearch& featureBasedSearch, IWeightedStatisticsSubset& statisticsSubset, + SingleRefinementComparator& comparator, uint32 numExamlesWithNonZeroWeights, + uint32 minCoverage, Refinement& refinement) const override; - void searchForRefinement(RuleRefinementSearch& ruleRefinementSearch, - IWeightedStatisticsSubset& statisticsSubset, FixedRefinementComparator& comparator, - uint32 numExamlesWithNonZeroWeights, uint32 minCoverage, - Refinement& refinement) const override; + void searchForRefinement(FeatureBasedSearch& featureBasedSearch, IWeightedStatisticsSubset& statisticsSubset, + FixedRefinementComparator& comparator, uint32 numExamlesWithNonZeroWeights, + uint32 minCoverage, Refinement& refinement) const override; void updateCoverageMaskAndStatistics(const Interval& interval, CoverageMask& coverageMask, uint32 indicatorValue, IWeightedStatistics& statistics) const override; diff --git a/cpp/subprojects/common/include/mlrl/common/input/interval.hpp b/cpp/subprojects/common/include/mlrl/common/input/interval.hpp index 0af2e4a72c..0915ba600f 100644 --- 
a/cpp/subprojects/common/include/mlrl/common/input/interval.hpp +++ b/cpp/subprojects/common/include/mlrl/common/input/interval.hpp @@ -3,7 +3,7 @@ */ #pragma once -#include "mlrl/common/data/types.hpp" +#include "mlrl/common/data/tuple.hpp" /** * Specifies the boundaries of an interval that includes/excludes certain elements in a vector. @@ -51,3 +51,35 @@ struct Interval { return *this; } }; + +/** + * Returns the start and end index of an open interval `[0, interval.end]` or `[interval.start, maxIndex]`, depending on + * a given `Interval`. + * + * @param interval A reference to an object of type `Interval` + * @param maxIndex The maximum index of an open interval + * @return A `Tuple` that stores the start and end index + */ +static inline Tuple getStartAndEndOfOpenInterval(const Interval& interval, uint32 maxIndex) { + Tuple tuple; + + if (interval.inverse) { + if (interval.start > 0) { + tuple.first = 0; + tuple.second = interval.start; + } else { + tuple.first = interval.end; + tuple.second = maxIndex; + } + } else { + tuple.first = interval.start; + + if (tuple.first > 0) { + tuple.second = maxIndex; + } else { + tuple.second = interval.end; + } + } + + return tuple; +} diff --git a/cpp/subprojects/common/include/mlrl/common/input/missing_feature_vector.hpp b/cpp/subprojects/common/include/mlrl/common/input/missing_feature_vector.hpp deleted file mode 100644 index 899d2d9cfa..0000000000 --- a/cpp/subprojects/common/include/mlrl/common/input/missing_feature_vector.hpp +++ /dev/null @@ -1,68 +0,0 @@ -/* - * @author Michael Rapp (michael.rapp.ml@gmail.com) - */ -#pragma once - -#include "mlrl/common/data/view_vector_dok_binary.hpp" - -#include - -/** - * An one-dimensional sparse vector that stores the indices of training examples with missing feature values using the - * dictionary of keys (DOK) format. 
- */ -// TODO Remove class -class OldMissingFeatureVector { - private: - - std::unique_ptr>> missingIndicesPtr_; - - public: - - OldMissingFeatureVector(); - - /** - * @param missingFeatureVector A reference to an object of type `OldMissingFeatureVector`, the missing indices - * should be taken from - */ - OldMissingFeatureVector(OldMissingFeatureVector& missingFeatureVector); - - /** - * An iterator that provides read-only access to the missing indices. - */ - typedef BinaryDokVector::index_const_iterator missing_index_const_iterator; - - /** - * Returns a `missing_index_const_iterator` to the beginning of the missing indices. - * - * @return A `missing_index_const_iterator` to the beginning - */ - missing_index_const_iterator missing_indices_cbegin() const; - - /** - * Returns a `missing_index_const_iterator` to the end of the missing indices. - * - * @return A `missing_index_const_iterator` to the end - */ - missing_index_const_iterator missing_indices_cend() const; - - /** - * Adds the index of an example with missing feature value. - * - * @param index The index to be added - */ - void addMissingIndex(uint32 index); - - /** - * Returns whether the example at a specific index has a missing feature value. - * - * @param index The index of the example to be checked - * @return True, if the example at the given index has a missing feature value, false otherwise - */ - bool isMissing(uint32 index) const; - - /** - * Removes all indices of examples with missing feature values. 
- */ - void clearMissingIndices(); -}; diff --git a/cpp/subprojects/common/include/mlrl/common/learner.hpp b/cpp/subprojects/common/include/mlrl/common/learner.hpp index b68841c47e..3af5124b1a 100644 --- a/cpp/subprojects/common/include/mlrl/common/learner.hpp +++ b/cpp/subprojects/common/include/mlrl/common/learner.hpp @@ -3,9 +3,9 @@ */ #pragma once -#include "mlrl/common/binning/feature_binning_equal_frequency.hpp" -#include "mlrl/common/binning/feature_binning_equal_width.hpp" -#include "mlrl/common/binning/feature_binning_no.hpp" +#include "mlrl/common/input/feature_binning_equal_frequency.hpp" +#include "mlrl/common/input/feature_binning_equal_width.hpp" +#include "mlrl/common/input/feature_binning_no.hpp" #include "mlrl/common/input/feature_info.hpp" #include "mlrl/common/input/feature_matrix_column_wise.hpp" #include "mlrl/common/input/feature_matrix_row_wise.hpp" @@ -503,8 +503,7 @@ class MLRLCOMMON_API IRuleLearner { virtual void useNoFeatureBinning() { std::unique_ptr& featureBinningConfigPtr = this->getFeatureBinningConfigPtr(); - featureBinningConfigPtr = - std::make_unique(this->getParallelStatisticUpdateConfigPtr()); + featureBinningConfigPtr = std::make_unique(); } }; @@ -528,7 +527,7 @@ class MLRLCOMMON_API IRuleLearner { std::unique_ptr& featureBinningConfigPtr = this->getFeatureBinningConfigPtr(); std::unique_ptr ptr = - std::make_unique(this->getParallelStatisticUpdateConfigPtr()); + std::make_unique(); IEqualWidthFeatureBinningConfig& ref = *ptr; featureBinningConfigPtr = std::move(ptr); return ref; @@ -555,7 +554,7 @@ class MLRLCOMMON_API IRuleLearner { std::unique_ptr& featureBinningConfigPtr = this->getFeatureBinningConfigPtr(); std::unique_ptr ptr = - std::make_unique(this->getParallelStatisticUpdateConfigPtr()); + std::make_unique(); IEqualFrequencyFeatureBinningConfig& ref = *ptr; featureBinningConfigPtr = std::move(ptr); return ref; @@ -1813,8 +1812,8 @@ class AbstractRuleLearner : virtual public IRuleLearner { std::unique_ptr 
createRuleModelAssemblageFactory( const IRowWiseLabelMatrix& labelMatrix) const; - std::unique_ptr createThresholdsFactory(const IFeatureMatrix& featureMatrix, - const ILabelMatrix& labelMatrix) const; + std::unique_ptr createFeatureSpaceFactory(const IFeatureMatrix& featureMatrix, + const ILabelMatrix& labelMatrix) const; std::unique_ptr createRuleInductionFactory(const IFeatureMatrix& featureMatrix, const ILabelMatrix& labelMatrix) const; diff --git a/cpp/subprojects/common/include/mlrl/common/post_optimization/post_optimization.hpp b/cpp/subprojects/common/include/mlrl/common/post_optimization/post_optimization.hpp index 731f36f696..5095499697 100644 --- a/cpp/subprojects/common/include/mlrl/common/post_optimization/post_optimization.hpp +++ b/cpp/subprojects/common/include/mlrl/common/post_optimization/post_optimization.hpp @@ -7,9 +7,9 @@ #include "mlrl/common/post_processing/post_processor.hpp" #include "mlrl/common/rule_induction/rule_induction.hpp" #include "mlrl/common/rule_pruning/rule_pruning.hpp" +#include "mlrl/common/rule_refinement/feature_space.hpp" #include "mlrl/common/sampling/feature_sampling.hpp" #include "mlrl/common/sampling/label_sampling.hpp" -#include "mlrl/common/thresholds/thresholds.hpp" /** * Defines an interface for all classes that allow to optimize a rule-based model globally once it has been learned. @@ -22,8 +22,8 @@ class IPostOptimizationPhase { /** * Optimizes a rule-based model globally once it has been learned. 
* - * @param thresholds A reference to an object of type `IThresholds` that provides access to the - * thresholds that may be used by the conditions of the rule + * @param featureSpace A reference to an object of type `IFeatureSpace` that provides access to the feature + * space * @param ruleInduction A reference to an object of type `IRuleInduction` that should be used for inducing * new rules * @param partition A reference to an object of type `IPartition` that provides access to the indices of @@ -42,10 +42,11 @@ class IPostOptimizationPhase { * @param rng A reference to an object of type `RNG` that implements the random number generator * to be used */ - virtual void optimizeModel(IThresholds& thresholds, const IRuleInduction& ruleInduction, IPartition& partition, - ILabelSampling& labelSampling, IInstanceSampling& instanceSampling, - IFeatureSampling& featureSampling, const IRulePruning& rulePruning, - const IPostProcessor& postProcessor, RNG& rng) const = 0; + virtual void optimizeModel(IFeatureSpace& featureSpace, const IRuleInduction& ruleInduction, + IPartition& partition, ILabelSampling& labelSampling, + IInstanceSampling& instanceSampling, IFeatureSampling& featureSampling, + const IRulePruning& rulePruning, const IPostProcessor& postProcessor, + RNG& rng) const = 0; }; /** diff --git a/cpp/subprojects/common/include/mlrl/common/rule_induction/rule_induction.hpp b/cpp/subprojects/common/include/mlrl/common/rule_induction/rule_induction.hpp index 11fb32fec0..24639285b1 100644 --- a/cpp/subprojects/common/include/mlrl/common/rule_induction/rule_induction.hpp +++ b/cpp/subprojects/common/include/mlrl/common/rule_induction/rule_induction.hpp @@ -8,11 +8,11 @@ #include "mlrl/common/model/model_builder.hpp" #include "mlrl/common/post_processing/post_processor.hpp" #include "mlrl/common/rule_pruning/rule_pruning.hpp" +#include "mlrl/common/rule_refinement/feature_space.hpp" #include "mlrl/common/sampling/feature_sampling.hpp" #include 
"mlrl/common/sampling/partition.hpp" #include "mlrl/common/sampling/weight_vector.hpp" #include "mlrl/common/statistics/statistics.hpp" -#include "mlrl/common/thresholds/thresholds.hpp" /** * Defines an interface for all classes that implement an algorithm for the induction of individual rules. @@ -34,8 +34,8 @@ class IRuleInduction { /** * Induces a new rule. * - * @param thresholds A reference to an object of type `IThresholds` that provides access to the - * thresholds that may be used by the conditions of the rule + * @param featureSpace A reference to an object of type `IFeatureSpace` that provides access to the feature + * space * @param labelIndices A reference to an object of type `IIndexVector` that provides access to the indices * of the labels for which the rule may predict * @param weights A reference to an object of type `IWeightVector` that provides access to the weights @@ -54,8 +54,8 @@ class IRuleInduction { * @param modelBuilder A reference to an object of type `IModelBuilder`, the rule should be added to * @return True, if a rule has been induced, false otherwise */ - virtual bool induceRule(IThresholds& thresholds, const IIndexVector& labelIndices, const IWeightVector& weights, - IPartition& partition, IFeatureSampling& featureSampling, + virtual bool induceRule(IFeatureSpace& featureSpace, const IIndexVector& labelIndices, + const IWeightVector& weights, IPartition& partition, IFeatureSampling& featureSampling, const IRulePruning& rulePruning, const IPostProcessor& postProcessor, RNG& rng, IModelBuilder& modelBuilder) const = 0; }; diff --git a/cpp/subprojects/common/include/mlrl/common/rule_model_assemblage/rule_model_assemblage.hpp b/cpp/subprojects/common/include/mlrl/common/rule_model_assemblage/rule_model_assemblage.hpp index 67c2fbd8f9..cf64f256b1 100644 --- a/cpp/subprojects/common/include/mlrl/common/rule_model_assemblage/rule_model_assemblage.hpp +++ 
b/cpp/subprojects/common/include/mlrl/common/rule_model_assemblage/rule_model_assemblage.hpp @@ -6,13 +6,13 @@ #include "mlrl/common/input/label_matrix_row_wise.hpp" #include "mlrl/common/model/model_builder.hpp" #include "mlrl/common/rule_induction/rule_induction.hpp" +#include "mlrl/common/rule_refinement/feature_space.hpp" #include "mlrl/common/sampling/feature_sampling.hpp" #include "mlrl/common/sampling/instance_sampling.hpp" #include "mlrl/common/sampling/label_sampling.hpp" #include "mlrl/common/sampling/partition_sampling.hpp" #include "mlrl/common/statistics/statistics_provider.hpp" #include "mlrl/common/stopping/stopping_criterion.hpp" -#include "mlrl/common/thresholds/thresholds.hpp" /** * Defines an interface for all classes that implement an algorithm for the induction of several rules that will be @@ -42,8 +42,8 @@ class IRuleModelAssemblage { * features that may be used by the conditions of a rule * @param statisticsProvider A reference to an object of type `IStatisticsProvider` that provides access to * the statistics which serve as the basis for learning rules - * @param thresholds A reference to an object of type `IThresholds` that provides access to the - * thresholds that may be used by the conditions of rules + * @param featureSpace A reference to an object of type `IFeatureSpace` that provides access to the + * feature space * @param rng A reference to an object of type `RNG` that implements the random number * generator to be used * @param modelBuilder A reference to an object of type `IModelBuilder`, the rules should be added to @@ -52,7 +52,7 @@ class IRuleModelAssemblage { const IPostProcessor& postProcessor, IPartition& partition, ILabelSampling& labelSampling, IInstanceSampling& instanceSampling, IFeatureSampling& featureSampling, IStatisticsProvider& statisticsProvider, - IThresholds& thresholds, IModelBuilder& modelBuilder, RNG& rng) const = 0; + IFeatureSpace& featureSpace, IModelBuilder& modelBuilder, RNG& rng) const = 0; }; /** diff 
--git a/cpp/subprojects/common/include/mlrl/common/rule_pruning/rule_pruning.hpp b/cpp/subprojects/common/include/mlrl/common/rule_pruning/rule_pruning.hpp index 0a93e27646..e51738d908 100644 --- a/cpp/subprojects/common/include/mlrl/common/rule_pruning/rule_pruning.hpp +++ b/cpp/subprojects/common/include/mlrl/common/rule_pruning/rule_pruning.hpp @@ -4,8 +4,8 @@ #pragma once #include "mlrl/common/model/condition_list.hpp" +#include "mlrl/common/rule_refinement/feature_subspace.hpp" #include "mlrl/common/sampling/partition.hpp" -#include "mlrl/common/thresholds/thresholds_subset.hpp" /** * Defines an interface for all classes that implement a strategy for pruning individual rules based on a "prune set", @@ -22,9 +22,8 @@ class IRulePruning { * pruned by removing individual conditions in a way that improves over its original quality, measured on the * prune set. * - * @param thresholdsSubset A reference to an object of type `IThresholdsSubset`, which contains the thresholds - * that correspond to the subspace of the instance space that is covered by the - * existing rule + * @param featureSubspace A reference to an object of type `IFeatureSubspace` that includes the training + * examples covered by the existing rule * @param partition A reference to an object of type `IPartition` that provides access to the indices of * the training examples that belong to the training set and the holdout set, * respectively @@ -32,12 +31,12 @@ class IRulePruning { * existing rule * @param head A reference to an object of type `IPrediction` that stores the scores that are * predicted by the existing rule - * @return An unique pointer to an object of type `ICoverageState` that keeps track of the + * @return An unique pointer to an object of type `CoverageMask` that keeps track of the * examples that are covered by the pruned rule or a null pointer if the rule was not * pruned */ - virtual std::unique_ptr prune(IThresholdsSubset& thresholdsSubset, IPartition& partition, - 
ConditionList& conditions, const IPrediction& head) const = 0; + virtual std::unique_ptr prune(IFeatureSubspace& featureSubspace, IPartition& partition, + ConditionList& conditions, const IPrediction& head) const = 0; }; /** diff --git a/cpp/subprojects/common/include/mlrl/common/rule_refinement/coverage_mask.hpp b/cpp/subprojects/common/include/mlrl/common/rule_refinement/coverage_mask.hpp new file mode 100644 index 0000000000..82ade434ce --- /dev/null +++ b/cpp/subprojects/common/include/mlrl/common/rule_refinement/coverage_mask.hpp @@ -0,0 +1,52 @@ +/* + * @author Michael Rapp (michael.rapp.ml@gmail.com) + */ +#pragma once + +#include "mlrl/common/data/vector_dense.hpp" +#include "mlrl/common/util/quality.hpp" + +#include + +// Forward declarations +class IFeatureSubspace; +class SinglePartition; +class BiPartition; +class IPrediction; + +/** + * Allows to check whether individual examples are covered by a rule or not. For each example, an integer is stored in a + * vector that may be updated when the rule is refined. If the value that corresponds to a certain example is equal to + * the "indicator value", it is considered to be covered, otherwise it is not. + */ +class CoverageMask final : public DenseVectorDecorator> { + public: + + /** + * The "indicator value". + */ + uint32 indicatorValue; + + /** + * @param numElements The total number of examples + */ + CoverageMask(uint32 numElements); + + /** + * @param other A reference to an object of type `CoverageMask` to be copied + */ + CoverageMask(const CoverageMask& other); + + /** + * Resets the mask and the "indicator value" such that all examples are marked as covered. + */ + void reset(); + + /** + * Returns whether the example at a specific index is covered or not. 
+ * + * @param index The index of the example + * @return True, if the example at the given index is covered, false otherwise + */ + bool operator[](uint32 index) const; +}; diff --git a/cpp/subprojects/common/include/mlrl/common/rule_refinement/rule_refinement_search.hpp b/cpp/subprojects/common/include/mlrl/common/rule_refinement/feature_based_search.hpp similarity index 77% rename from cpp/subprojects/common/include/mlrl/common/rule_refinement/rule_refinement_search.hpp rename to cpp/subprojects/common/include/mlrl/common/rule_refinement/feature_based_search.hpp index 2d8b407410..d629a517e6 100644 --- a/cpp/subprojects/common/include/mlrl/common/rule_refinement/rule_refinement_search.hpp +++ b/cpp/subprojects/common/include/mlrl/common/rule_refinement/feature_based_search.hpp @@ -4,6 +4,7 @@ #pragma once #include "mlrl/common/input/feature_vector_binary.hpp" +#include "mlrl/common/input/feature_vector_binned.hpp" #include "mlrl/common/input/feature_vector_missing.hpp" #include "mlrl/common/input/feature_vector_nominal.hpp" #include "mlrl/common/input/feature_vector_numerical.hpp" @@ -16,12 +17,12 @@ * Allows to conduct a search for finding the best refinement of an existing rule that can be created from a given * feature vector. */ -class RuleRefinementSearch final { +class FeatureBasedSearch final { public: /** - * Conducts a search for the best refinement of an existing rule that can be created from a given numerical - * feature vector. + * Conducts a search for the best refinement of an existing rule that can be created from a + * `NumericalFeatureVector`. * * @param featureVector A reference to an object of type `NumericalFeatureVector`, the * refinements should be created from @@ -46,8 +47,8 @@ class RuleRefinementSearch final { uint32 minCoverage, Refinement& refinement) const; /** - * Conducts a search for the best refinement of an existing rule that can be created from a given numerical - * feature vector. 
+ * Conducts a search for the best refinement of an existing rule that can be created from a + * `NumericalFeatureVector`. * * @param featureVector A reference to an object of type `NumericalFeatureVector`, the * refinements should be created from @@ -72,8 +73,8 @@ class RuleRefinementSearch final { uint32 minCoverage, Refinement& refinement) const; /** - * Conducts a search for the best refinement of an existing rule that can be created from a given nominal - * feature vector. + * Conducts a search for the best refinement of an existing rule that can be created from a + * `NominalFeatureVector`. * * @param featureVector A reference to an object of type `NominalFeatureVector`, the refinements * should be created from @@ -98,8 +99,8 @@ class RuleRefinementSearch final { uint32 minCoverage, Refinement& refinement) const; /** - * Conducts a search for the best refinement of an existing rule that can be created from a given nominal - * feature vector. + * Conducts a search for the best refinement of an existing rule that can be created from a + * `NominalFeatureVector`. * * @param featureVector A reference to an object of type `NominalFeatureVector`, the refinements * should be created from @@ -124,8 +125,8 @@ class RuleRefinementSearch final { uint32 minCoverage, Refinement& refinement) const; /** - * Conducts a search for the best refinement of an existing rule that can be created from a given binary feature - * vector. + * Conducts a search for the best refinement of an existing rule that can be created from a + * `BinaryFeatureVector`. * * @param featureVector A reference to an object of type `BinaryFeatureVector`, the refinements * should be created from @@ -150,8 +151,8 @@ class RuleRefinementSearch final { uint32 minCoverage, Refinement& refinement) const; /** - * Conducts a search for the best refinement of an existing rule that can be created from a given binary feature - * vector. 
+ * Conducts a search for the best refinement of an existing rule that can be created from a + * `BinaryFeatureVector`. * * @param featureVector A reference to an object of type `BinaryFeatureVector`, the refinements * should be created from @@ -176,8 +177,8 @@ class RuleRefinementSearch final { uint32 minCoverage, Refinement& refinement) const; /** - * Conducts a search for the best refinement of an existing rule that can be created from a given ordinal - * feature vector. + * Conducts a search for the best refinement of an existing rule that can be created from an + * `OrdinalFeatureVector`. * * @param featureVector A reference to an object of type `OrdinalFeatureVector`, the refinements * should be created from @@ -202,8 +203,8 @@ class RuleRefinementSearch final { uint32 minCoverage, Refinement& refinement) const; /** - * Conducts a search for the best refinement of an existing rule that can be created from a given ordinal - * feature vector. + * Conducts a search for the best refinement of an existing rule that can be created from an + * `OrdinalFeatureVector`. * * @param featureVector A reference to an object of type `OrdinalFeatureVector`, the refinements * should be created from @@ -226,4 +227,56 @@ class RuleRefinementSearch final { IWeightedStatisticsSubset& statisticsSubset, FixedRefinementComparator& comparator, uint32 numExamplesWithNonZeroWeights, uint32 minCoverage, Refinement& refinement) const; + + /** + * Conducts a search for the best refinement of an existing rule that can be created from a + * `BinnedFeatureVector`. 
+ * + * @param featureVector A reference to an object of type `BinnedFeatureVector`, the refinements + * should be created from + * @param missingFeatureVector A reference to an object of type `MissingFeatureVector` that provides + * access to the indices of training examples with missing feature values + * @param statisticsSubset A reference to an object of type `IWeightedStatisticsSubset` that + * provides access to weighted statistics about the labels of the training + * examples, which should serve as the basis for evaluating the quality of + * potential refinements + * @param comparator A reference to an object of type `SingleRefinementComparator` that + * should be used for comparing potential refinements + * @param numExamplesWithNonZeroWeights The total number of examples with non-zero weights that may be covered + * by a refinement + * @param minCoverage The minimum number of examples that must be covered by the refinement + * @param refinement A reference to an object of type `Refinement` that should be used for + * storing the properties of the best refinement that is found + */ + void searchForBinnedRefinement(const BinnedFeatureVector& featureVector, + const MissingFeatureVector& missingFeatureVector, + IWeightedStatisticsSubset& statisticsSubset, + SingleRefinementComparator& comparator, uint32 numExamplesWithNonZeroWeights, + uint32 minCoverage, Refinement& refinement) const; + + /** + * Conducts a search for the best refinement of an existing rule that can be created from a + * `BinnedFeatureVector`. 
+ * + * @param featureVector A reference to an object of type `BinnedFeatureVector`, the refinements + * should be created from + * @param missingFeatureVector A reference to an object of type `MissingFeatureVector` that provides + * access to the indices of training examples with missing feature values + * @param statisticsSubset A reference to an object of type `IWeightedStatisticsSubset` that + * provides access to weighted statistics about the labels of the training + * examples, which should serve as the basis for evaluating the quality of + * potential refinements + * @param comparator A reference to an object of type `MultiRefinementComparator` that should + * be used for comparing potential refinements + * @param numExamplesWithNonZeroWeights The total number of examples with non-zero weights that may be covered + * by a refinement + * @param minCoverage The minimum number of examples that must be covered by the refinements + * @param refinement A reference to an object of type `Refinement` that should be used for + * storing the properties of the best refinement that is found + */ + void searchForBinnedRefinement(const BinnedFeatureVector& featureVector, + const MissingFeatureVector& missingFeatureVector, + IWeightedStatisticsSubset& statisticsSubset, + FixedRefinementComparator& comparator, uint32 numExamplesWithNonZeroWeights, + uint32 minCoverage, Refinement& refinement) const; }; diff --git a/cpp/subprojects/common/include/mlrl/common/thresholds/thresholds.hpp b/cpp/subprojects/common/include/mlrl/common/rule_refinement/feature_space.hpp similarity index 57% rename from cpp/subprojects/common/include/mlrl/common/thresholds/thresholds.hpp rename to cpp/subprojects/common/include/mlrl/common/rule_refinement/feature_space.hpp index 809c61d063..8d76dfa38b 100644 --- a/cpp/subprojects/common/include/mlrl/common/thresholds/thresholds.hpp +++ b/cpp/subprojects/common/include/mlrl/common/rule_refinement/feature_space.hpp @@ -5,51 +5,50 @@ #include 
"mlrl/common/input/feature_info.hpp" #include "mlrl/common/input/feature_matrix_column_wise.hpp" +#include "mlrl/common/rule_refinement/feature_subspace.hpp" #include "mlrl/common/sampling/weight_vector_bit.hpp" #include "mlrl/common/sampling/weight_vector_dense.hpp" #include "mlrl/common/sampling/weight_vector_equal.hpp" #include "mlrl/common/statistics/statistics_provider.hpp" -#include "mlrl/common/thresholds/thresholds_subset.hpp" /** - * Defines an interface for all classes that provide access to thresholds that may be used by the first condition of a - * rule that currently has an empty body and therefore covers the entire instance space. + * Defines an interface for all classes that provide access to the feature space. */ -class IThresholds { +class IFeatureSpace { public: - virtual ~IThresholds() {} + virtual ~IFeatureSpace() {} /** - * Creates and returns a new subset of the thresholds, which initially contains all of the thresholds. + * Creates and returns a new subspace of this feature space. * * @param weights A reference to an object of type `EqualWeightVector` that provides access to the weights of * individual training examples - * @return An unique pointer to an object of type `IThresholdsSubset` that has been created + * @return An unique pointer to an object of type `IFeatureSubspace` that has been created */ - virtual std::unique_ptr createSubset(const EqualWeightVector& weights) = 0; + virtual std::unique_ptr createSubspace(const EqualWeightVector& weights) = 0; /** - * Creates and returns a new subset of the thresholds, which initially contains all of the thresholds. + * Creates and returns a new subspace of this feature space. 
* * @param weights A reference to an object of type `BitWeightVector` that provides access to the weights of * individual training examples - * @return An unique pointer to an object of type `IThresholdsSubset` that has been created + * @return An unique pointer to an object of type `IFeatureSubspace` that has been created */ - virtual std::unique_ptr createSubset(const BitWeightVector& weights) = 0; + virtual std::unique_ptr createSubspace(const BitWeightVector& weights) = 0; /** - * Creates and returns a new subset of the thresholds, which initially contains all of the thresholds. + * Creates and returns a new subspace of this feature space. * - * @param weights A reference to an object of type `DenseWeightVector` that provides access to the - * weights of individual training examples - * @return An unique pointer to an object of type `IThresholdsSubset` that has been created + * @param weights A reference to an object of type `DenseWeightVector` that provides access to the weights of + * individual training examples + * @return An unique pointer to an object of type `IFeatureSubspace` that has been created */ - virtual std::unique_ptr createSubset(const DenseWeightVector& weights) = 0; + virtual std::unique_ptr createSubspace(const DenseWeightVector& weights) = 0; /** * Returns a reference to an object of type `IStatisticsProvider` that provides access to the statistics that - * correspond to individual training examples in the instance space. + * correspond to individual training examples in the feature space. * * @return A reference to an object of type `IStatisticsProvider` */ @@ -57,15 +56,15 @@ class IThresholds { }; /** - * Defines an interface for all classes that allow to create instances of the type `IThresholds`. + * Defines an interface for all classes that allow to create instances of the type `IFeatureSpace`. 
*/ -class IThresholdsFactory { +class IFeatureSpaceFactory { public: - virtual ~IThresholdsFactory() {} + virtual ~IFeatureSpaceFactory() {} /** - * Creates and returns a new object of type `IThresholds`. + * Creates and returns a new object of type `IFeatureSpace`. * * @param featureMatrix A reference to an object of type `IColumnWiseFeatureMatrix` that provides * column-wise access to the feature values of individual training examples @@ -73,9 +72,9 @@ class IThresholdsFactory { * the types of individual features * @param statisticsProvider A reference to an object of type `IStatisticsProvider` that provides access to * statistics about the labels of the training examples - * @return An unique pointer to an object of type `IThresholds` that has been created + * @return An unique pointer to an object of type `IFeatureSpace` that has been created */ - virtual std::unique_ptr create(const IColumnWiseFeatureMatrix& featureMatrix, - const IFeatureInfo& featureInfo, - IStatisticsProvider& statisticsProvider) const = 0; + virtual std::unique_ptr create(const IColumnWiseFeatureMatrix& featureMatrix, + const IFeatureInfo& featureInfo, + IStatisticsProvider& statisticsProvider) const = 0; }; diff --git a/cpp/subprojects/common/include/mlrl/common/rule_refinement/feature_space_tabular.hpp b/cpp/subprojects/common/include/mlrl/common/rule_refinement/feature_space_tabular.hpp new file mode 100644 index 0000000000..6ed98f50ce --- /dev/null +++ b/cpp/subprojects/common/include/mlrl/common/rule_refinement/feature_space_tabular.hpp @@ -0,0 +1,33 @@ +/* + * @author Michael Rapp (michael.rapp.ml@gmail.com) + */ +#pragma once + +#include "mlrl/common/input/feature_binning.hpp" +#include "mlrl/common/rule_refinement/feature_space.hpp" + +/** + * Allows to create objects of type `IFeatureSpace` that provide access to a tabular feature space. 
+ */ +class TabularFeatureSpaceFactory final : public IFeatureSpaceFactory { + private: + + const std::unique_ptr featureBinningFactoryPtr_; + + const uint32 numThreads_; + + public: + + /** + * @param featureBinningFactoryPtr An unique pointer to an object of type `IFeatureBinningFactory` that allows + * to create implementations of the binning method to be used for assigning + * numerical feature values to bins + * @param numThreads The number of CPU threads to be used to update statistics in parallel. Must + * be at least 1 + */ + TabularFeatureSpaceFactory(std::unique_ptr featureBinningFactoryPtr, uint32 numThreads); + + std::unique_ptr create(const IColumnWiseFeatureMatrix& featureMatrix, + const IFeatureInfo& featureInfo, + IStatisticsProvider& statisticsProvider) const override; +}; diff --git a/cpp/subprojects/common/include/mlrl/common/rule_refinement/feature_subspace.hpp b/cpp/subprojects/common/include/mlrl/common/rule_refinement/feature_subspace.hpp new file mode 100644 index 0000000000..b46fa342d1 --- /dev/null +++ b/cpp/subprojects/common/include/mlrl/common/rule_refinement/feature_subspace.hpp @@ -0,0 +1,165 @@ +/* + * @author Michael Rapp (michael.rapp.ml@gmail.com) + */ +#pragma once + +#include "mlrl/common/indices/index_vector_complete.hpp" +#include "mlrl/common/indices/index_vector_partial.hpp" +#include "mlrl/common/model/condition.hpp" +#include "mlrl/common/rule_refinement/coverage_mask.hpp" +#include "mlrl/common/rule_refinement/prediction.hpp" +#include "mlrl/common/rule_refinement/rule_refinement.hpp" +#include "mlrl/common/sampling/partition_bi.hpp" +#include "mlrl/common/sampling/partition_single.hpp" + +#include + +/** + * Defines an interface for all classes that provide access to a subspace of the feature space that includes the training + * examples covered by a rule. + */ +class IFeatureSubspace { + public: + + virtual ~IFeatureSubspace() {} + + /** + * Creates and returns a copy of this object.
+ * + * @return An unique pointer to an object of type `IFeatureSubspace` that has been created + */ + virtual std::unique_ptr copy() const = 0; + + /** + * Creates and returns a new instance of the type `IRuleRefinement` that allows to find the best refinement of + * a rule that covers all examples included in this subspace and predicts for all available labels. + * + * @param labelIndices A reference to an object of type `CompleteIndexVector` that provides access to the + * indices of the labels for which the existing rule predicts + * @param featureIndex The index of the feature that should be considered when searching for refinements + * @return An unique pointer to an object of type `IRuleRefinement` that has been created + */ + virtual std::unique_ptr createRuleRefinement(const CompleteIndexVector& labelIndices, + uint32 featureIndex) = 0; + + /** + * Creates and returns a new instance of the type `IRuleRefinement` that allows to find the best refinement of + * a rule that covers all examples included in this subspace and predicts for a subset of the available labels. + * + * @param labelIndices A reference to an object of type `PartialIndexVector` that provides access to the + * indices of the labels for which the existing rule predicts + * @param featureIndex The index of the feature that should be considered when searching for refinements + * @return An unique pointer to an object of type `IRuleRefinement` that has been created + */ + virtual std::unique_ptr createRuleRefinement(const PartialIndexVector& labelIndices, + uint32 featureIndex) = 0; + + /** + * Filters the subspace such that it only includes those training examples that satisfy a specific condition. + * + * @param condition A reference to an object of type `Condition` + */ + virtual void filterSubspace(const Condition& condition) = 0; + + /** + * Resets the subspace. This reverts the effects of all previous calls to the function `filterSubspace`.
+ */ + virtual void resetSubspace() = 0; + + /** + * Returns an object of type `CoverageMask` that keeps track of the training examples that are included in this + * subspace. + * + * @return A reference to an object of type `CoverageMask` that keeps track of the training examples that are + * included in this subspace + */ + virtual const CoverageMask& getCoverageMask() const = 0; + + /** + * Calculates and returns a numerical score that assesses the quality of a rule's prediction for all examples + * that do not belong to the current instance sub-sample and are marked as covered according to a given + * `CoverageMask`. + * + * For calculating the quality, only examples that belong to the training set and are not included in the + * current instance sub-sample, i.e., only examples with zero weights, are considered and assigned equally + * distributed weights. + * + * @param partition A reference to an object of type `SinglePartition` that provides access to the indices + * of the training examples that belong to the training set + * @param coverageMask A reference to an object of type `CoverageMask` that keeps track of the examples that + * are covered by the rule + * @param head A reference to an object of type `IPrediction` that stores the scores that are predicted + * by the rule + * @return An object of type `Quality` that stores the calculated quality + */ + virtual Quality evaluateOutOfSample(const SinglePartition& partition, const CoverageMask& coverageMask, + const IPrediction& head) const = 0; + + /** + * Calculates and returns a numerical score that assesses the quality of a rule's prediction for all examples + * that do not belong to the current instance sub-sample and are marked as covered according to a given + * `CoverageMask`. 
+ * + * For calculating the quality, only examples that belong to the training set and are not included in the + * current instance sub-sample, i.e., only examples with zero weights, are considered and assigned equally + * distributed weights. + * + * @param partition A reference to an object of type `BiPartition` that provides access to the indices of + * the training examples that belong to the training set + * @param coverageMask A reference to an object of type `CoverageMask` that keeps track of the examples that + * are covered by the rule + * @param head A reference to an object of type `IPrediction` that stores the scores that are predicted + * by the rule + * @return An object of type `Quality` that stores the calculated quality + */ + virtual Quality evaluateOutOfSample(const BiPartition& partition, const CoverageMask& coverageMask, + const IPrediction& head) const = 0; + + /** + * Recalculates and updates a rule's prediction based on all examples in the training set that are marked as + * covered according to a given `CoverageMask`. + * + * When calculating the updated prediction, the weights of the individual training examples are ignored and + * equally distributed weights are used instead. + * + * @param partition A reference to an object of type `SinglePartition` that provides access to the indices + * of the training examples that belong to the training set + * @param coverageMask A reference to an object of type `CoverageMask` that keeps track of the examples that + * are covered by the rule + * @param head A reference to an object of type `IPrediction` to be updated + */ + virtual void recalculatePrediction(const SinglePartition& partition, const CoverageMask& coverageMask, + IPrediction& head) const = 0; + + /** + * Recalculates and updates a rule's prediction based on all examples in the training set that are marked as + * covered according to a given `CoverageMask`. 
+ * + * When calculating the updated prediction, the weights of the individual training examples are ignored and + * equally distributed weights are used instead. + * + * @param partition A reference to an object of type `BiPartition` that provides access to the indices of + * the training examples that belong to the training set + * @param coverageMask A reference to an object of type `CoverageMask` that keeps track of the examples that + * are covered by the rule + * @param head A reference to an object of type `IPrediction` to be updated + */ + virtual void recalculatePrediction(const BiPartition& partition, const CoverageMask& coverageMask, + IPrediction& head) const = 0; + + /** + * Updates the statistics that correspond to the training examples included in this subspace based on the + * prediction of a rule. + * + * @param prediction A reference to an object of type `IPrediction` that stores the prediction of the rule + */ + virtual void applyPrediction(const IPrediction& prediction) = 0; + + /** + * Reverts the statistics that correspond to the training examples included in this subspace based on the + * predictions of a rule. 
+ * + * @param prediction A reference to an object of type `IPrediction` that stores the prediction of the rule + */ + virtual void revertPrediction(const IPrediction& prediction) = 0; +}; diff --git a/cpp/subprojects/common/include/mlrl/common/rule_refinement/prediction_complete.hpp b/cpp/subprojects/common/include/mlrl/common/rule_refinement/prediction_complete.hpp index 70830ff71a..9b14c90280 100644 --- a/cpp/subprojects/common/include/mlrl/common/rule_refinement/prediction_complete.hpp +++ b/cpp/subprojects/common/include/mlrl/common/rule_refinement/prediction_complete.hpp @@ -113,7 +113,7 @@ class CompletePrediction final : public VectorDecorator const IStatistics& statistics, const OutOfSampleWeightVector>& weights) const override; - std::unique_ptr createRuleRefinement(IThresholdsSubset& thresholdsSubset, + std::unique_ptr createRuleRefinement(IFeatureSubspace& featureSubspace, uint32 featureIndex) const override; void apply(IStatistics& statistics, uint32 statisticIndex) const override; diff --git a/cpp/subprojects/common/include/mlrl/common/rule_refinement/prediction_partial.hpp b/cpp/subprojects/common/include/mlrl/common/rule_refinement/prediction_partial.hpp index 986b6448da..a83b9a84a2 100644 --- a/cpp/subprojects/common/include/mlrl/common/rule_refinement/prediction_partial.hpp +++ b/cpp/subprojects/common/include/mlrl/common/rule_refinement/prediction_partial.hpp @@ -147,7 +147,7 @@ class PartialPrediction final : public ResizableVectorDecorator>& weights) const override; - std::unique_ptr createRuleRefinement(IThresholdsSubset& thresholdsSubset, + std::unique_ptr createRuleRefinement(IFeatureSubspace& featureSubspace, uint32 featureIndex) const override; void apply(IStatistics& statistics, uint32 statisticIndex) const override; diff --git a/cpp/subprojects/common/include/mlrl/common/rule_refinement/rule_refinement.hpp b/cpp/subprojects/common/include/mlrl/common/rule_refinement/rule_refinement.hpp index 8b122c3295..a784229524 100644 --- 
a/cpp/subprojects/common/include/mlrl/common/rule_refinement/rule_refinement.hpp +++ b/cpp/subprojects/common/include/mlrl/common/rule_refinement/rule_refinement.hpp @@ -3,8 +3,10 @@ */ #pragma once +#include "mlrl/common/input/feature_vector.hpp" #include "mlrl/common/rule_refinement/refinement_comparator_fixed.hpp" #include "mlrl/common/rule_refinement/refinement_comparator_single.hpp" +#include "mlrl/common/statistics/statistics_weighted_immutable.hpp" /** * Defines an interface for all classes that allow to find the best refinement of existing rules. @@ -12,6 +14,54 @@ class IRuleRefinement { public: + /** + * Defines an interface for callbacks that may be invoked by subclasses of the class `IRuleRefinement` in + * order to retrieve the information that is required to search for potential refinements. It consists of + * `IImmutableWeightedStatistics`, as well as an `IFeatureVector` that allows to determine the thresholds that + * may be used by potential conditions. + */ + class ICallback { + public: + + /** + * The data that is provided via the callback's `get` function. + */ + struct Result final { + public: + + /** + * @param statistics A reference to an object of type `IImmutableWeightedStatistics` that + * should be used to search for potential refinements + * @param featureVector A reference to an object of type `IFeatureVector` that should be used to + * search for potential refinements + */ + Result(const IImmutableWeightedStatistics& statistics, const IFeatureVector& featureVector) + : statistics(statistics), featureVector(featureVector) {} + + /** + * A reference to an object of type `IImmutableWeightedStatistics` that should be used to search + * for potential refinements. + */ + const IImmutableWeightedStatistics& statistics; + + /** + * A reference to an object of type `IFeatureVector` that should be used to search for potential + * refinements.
+ */ + const IFeatureVector& featureVector; + }; + + virtual ~ICallback() {} + + /** + * Invokes the callback and returns its result. + * + * @return An object of type `Result` that stores references to the statistics and the feature vector + * that may be used to search for potential refinements + */ + virtual Result get() = 0; + }; + virtual ~IRuleRefinement() {} /** @@ -21,7 +71,7 @@ class IRuleRefinement { * comparing potential refinements * @param minCoverage The minimum number of examples that must be covered by the refinement */ - virtual void findRefinement(SingleRefinementComparator& comparator, uint32 minCoverage) = 0; + virtual void findRefinement(SingleRefinementComparator& comparator, uint32 minCoverage) const = 0; /** * Finds the best refinements of an existing rule. @@ -30,5 +80,5 @@ class IRuleRefinement { * comparing potential refinements * @param minCoverage The minimum number of examples that must be covered by the refinements */ - virtual void findRefinement(FixedRefinementComparator& comparator, uint32 minCoverage) = 0; + virtual void findRefinement(FixedRefinementComparator& comparator, uint32 minCoverage) const = 0; }; diff --git a/cpp/subprojects/common/include/mlrl/common/rule_refinement/rule_refinement_approximate.hpp b/cpp/subprojects/common/include/mlrl/common/rule_refinement/rule_refinement_approximate.hpp deleted file mode 100644 index 48d608c20d..0000000000 --- a/cpp/subprojects/common/include/mlrl/common/rule_refinement/rule_refinement_approximate.hpp +++ /dev/null @@ -1,57 +0,0 @@ -/* - * @author Lukas Johannes Eberle (lukasjohannes.eberle@stud.tu-darmstadt.de) - * @author Michael Rapp (michael.rapp.ml@gmail.com) - */ -#pragma once - -#include "mlrl/common/binning/threshold_vector.hpp" -#include "mlrl/common/rule_refinement/rule_refinement.hpp" -#include "mlrl/common/rule_refinement/rule_refinement_callback.hpp" -#include "mlrl/common/statistics/histogram.hpp" - -/** - * Allows to find the best refinements of existing rules, which 
result from adding a new condition that correspond to a - * certain feature. The thresholds that may be used by the new condition result from the boundaries between the bins - * that have been created using a binning method. - * - * @tparam IndexVector The type of the vector that provides access to the indices of the labels for which the refined - * rule is allowed to predict - */ -template -class ApproximateRuleRefinement final : public IRuleRefinement { - private: - - const IndexVector& labelIndices_; - - const uint32 numExamples_; - - const uint32 featureIndex_; - - const bool ordinal_; - - const bool nominal_; - - typedef IRuleRefinementCallback Callback; - - const std::unique_ptr callbackPtr_; - - public: - - /** - * @param labelIndices A reference to an object of template type `IndexVector` that provides access to the - * indices of the labels for which the refined rule is allowed to predict - * @param numExamples The total number of training examples with non-zero weights that are covered by the - * existing rule - * @param featureIndex The index of the feature, the new condition corresponds to - * @param ordinal True, if the feature at index `featureIndex` is ordinal, false otherwise - * @param nominal True, if the feature at index `featureIndex` is nominal, false otherwise - * @param callbackPtr An unique pointer to an object of type `IRuleRefinementCallback` that allows to - * retrieve the information that is required to search for potential refinements - */ - ApproximateRuleRefinement(const IndexVector& labelIndices, uint32 numExamples, uint32 featureIndex, - bool ordinal, bool nominal, std::unique_ptr callbackPtr); - - void findRefinement(SingleRefinementComparator& comparator, uint32 minCoverage) override; - - void findRefinement(FixedRefinementComparator& comparator, uint32 minCoverage) override; -}; diff --git a/cpp/subprojects/common/include/mlrl/common/rule_refinement/rule_refinement_callback.hpp 
b/cpp/subprojects/common/include/mlrl/common/rule_refinement/rule_refinement_callback.hpp deleted file mode 100644 index 3f8bb1e8d0..0000000000 --- a/cpp/subprojects/common/include/mlrl/common/rule_refinement/rule_refinement_callback.hpp +++ /dev/null @@ -1,55 +0,0 @@ -/* - * @author Michael Rapp (michael.rapp.ml@gmail.com) - */ -#pragma once - -/** - * Defines an interface for callbacks that may be invoked by subclasses of the the class `IRuleRefinement` in order to - * retrieve the information that is required to search for potential refinements. It consists of statistics, as well as - * a vector that allows to determine the thresholds that may be used by potential conditions. - * - * @tparam Statistics The type of the statistics, - * @tparam Vector The type of the vector that is returned by the callback - */ -// TODO Remove template argument Vector and use IFeatureVector instead -template -class IRuleRefinementCallback { - public: - - /** - * The data that is provided via the callback's `get` function. - */ - struct Result final { - public: - - /** - * @param statistics A reference to an object of template type `Statistics` that should be used to - * search for potential refinements - * @param vector A reference to an object of template type `Vector` that should be used to search - * for potential refinements - */ - Result(const Statistics& statistics, const Vector& vector) : statistics(statistics), vector(vector) {} - - /** - * A reference to an object of template type `Statistics` that should be used to search for potential - * refinements. - */ - const Statistics& statistics; - - /** - * A reference to an object of template type `Vector` that should be used to search for potential - * refinements. - */ - const Vector& vector; - }; - - virtual ~IRuleRefinementCallback() {} - - /** - * Invokes the callback and returns its result. 
- * - * @return An object of type `Result` that stores references to the statistics and the vector that may be used - * to search for potential refinements - */ - virtual Result get() = 0; -}; diff --git a/cpp/subprojects/common/include/mlrl/common/rule_refinement/rule_refinement_exact.hpp b/cpp/subprojects/common/include/mlrl/common/rule_refinement/rule_refinement_feature_based.hpp similarity index 74% rename from cpp/subprojects/common/include/mlrl/common/rule_refinement/rule_refinement_exact.hpp rename to cpp/subprojects/common/include/mlrl/common/rule_refinement/rule_refinement_feature_based.hpp index d95bb67a12..c67f0bd3df 100644 --- a/cpp/subprojects/common/include/mlrl/common/rule_refinement/rule_refinement_exact.hpp +++ b/cpp/subprojects/common/include/mlrl/common/rule_refinement/rule_refinement_feature_based.hpp @@ -3,9 +3,7 @@ */ #pragma once -#include "mlrl/common/input/feature_vector.hpp" #include "mlrl/common/rule_refinement/rule_refinement.hpp" -#include "mlrl/common/rule_refinement/rule_refinement_callback.hpp" #include "mlrl/common/statistics/statistics_weighted.hpp" /** @@ -17,7 +15,7 @@ * rule is allowed to predict */ template -class ExactRuleRefinement final : public IRuleRefinement { +class FeatureBasedRuleRefinement final : public IRuleRefinement { private: const IndexVector& labelIndices_; @@ -26,9 +24,7 @@ class ExactRuleRefinement final : public IRuleRefinement { const uint32 numExamplesWithNonZeroWeights_; - typedef IRuleRefinementCallback Callback; - - const std::unique_ptr callbackPtr_; + const std::unique_ptr callbackPtr_; public: @@ -39,14 +35,15 @@ class ExactRuleRefinement final : public IRuleRefinement { * @param featureIndex The index of the feature, the new condition corresponds to * @param numExamplesWithNonZeroWeights The total number of examples with non-zero weights that may be covered * by a refinement - * @param callbackPtr An unique pointer to an object of type `IRuleRefinementCallback` that + * @param callbackPtr An unique 
pointer to an object of type `IRuleRefinement::ICallback` that * allows to retrieve the information that is required to search for * potential refinements */ - ExactRuleRefinement(const IndexVector& labelIndices, uint32 featureIndex, uint32 numExamplesWithNonZeroWeights, - std::unique_ptr callbackPtr); + FeatureBasedRuleRefinement(const IndexVector& labelIndices, uint32 featureIndex, + uint32 numExamplesWithNonZeroWeights, + std::unique_ptr callbackPtr); - void findRefinement(SingleRefinementComparator& comparator, uint32 minCoverage) override; + void findRefinement(SingleRefinementComparator& comparator, uint32 minCoverage) const override; - void findRefinement(FixedRefinementComparator& comparator, uint32 minCoverage) override; + void findRefinement(FixedRefinementComparator& comparator, uint32 minCoverage) const override; }; diff --git a/cpp/subprojects/common/include/mlrl/common/sampling/partition.hpp b/cpp/subprojects/common/include/mlrl/common/sampling/partition.hpp index 2c0a9ae57f..a44bcd9fcf 100644 --- a/cpp/subprojects/common/include/mlrl/common/sampling/partition.hpp +++ b/cpp/subprojects/common/include/mlrl/common/sampling/partition.hpp @@ -3,6 +3,7 @@ */ #pragma once +#include "mlrl/common/rule_refinement/coverage_mask.hpp" #include "mlrl/common/util/quality.hpp" #include @@ -14,8 +15,7 @@ class IInstanceSampling; class IInstanceSamplingFactory; class IRowWiseLabelMatrix; class IStatistics; -class IThresholdsSubset; -class ICoverageState; +class IFeatureSubspace; class IPrediction; class IMarginalProbabilityCalibrationModel; class IMarginalProbabilityCalibrator; @@ -59,31 +59,31 @@ class IPartition { /** * Calculates and returns a numerical score that assesses the quality of a rule's prediction for all examples * that do not belong to the current sample and are marked as covered according to a given object of type - * `ICoverageState`. + * `CoverageMask`. 
* - * @param thresholdsSubset A reference to an object of type `IThresholdsSubset` that should be used to - * evaluate the prediction - * @param coverageState A reference to an object of type `ICoverageState` that keeps track of the examples - * that are covered by the rule - * @param head A reference to an object of type `IPrediction` that stores the scores that are - * predicted by the rule - * @return An object of type `Quality` that stores the calculated quality + * @param featureSubspace A reference to an object of type `IFeatureSubspace` that should be used to evaluate + * the prediction + * @param coverageMask A reference to an object of type `CoverageMask` that keeps track of the examples that + * are covered by the rule + * @param head A reference to an object of type `IPrediction` that stores the scores that are + * predicted by the rule + * @return An object of type `Quality` that stores the calculated quality */ - virtual Quality evaluateOutOfSample(const IThresholdsSubset& thresholdsSubset, - const ICoverageState& coverageState, const IPrediction& head) = 0; + virtual Quality evaluateOutOfSample(const IFeatureSubspace& featureSubspace, const CoverageMask& coverageMask, + const IPrediction& head) = 0; /** * Recalculates and updates a rule's prediction based on all examples in the training set that are marked as - * covered according to a given object of type `ICoverageState`. + * covered according to a given object of type `CoverageMask`. 
* - * @param thresholdsSubset A reference to an object of type `IThresholdsSubset` that should be used to - * recalculate the prediction - * @param coverageState A reference to an object of type `ICoverageState` that keeps track of the examples - * that are covered by the rule - * @param head A reference to an object of type `IPrediction` to be updated + * @param featureSubspace A reference to an object of type `IFeatureSubspace` that should be used to recalculate + * the prediction + * @param coverageMask A reference to an object of type `CoverageMask` that keeps track of the examples that + * are covered by the rule + * @param head A reference to an object of type `IPrediction` to be updated */ - virtual void recalculatePrediction(const IThresholdsSubset& thresholdsSubset, - const ICoverageState& coverageState, IPrediction& head) = 0; + virtual void recalculatePrediction(const IFeatureSubspace& featureSubspace, const CoverageMask& coverageMask, + IPrediction& head) = 0; /** * Fits and returns a model for the calibration of marginal probabilities, based on the type of this partition. 
diff --git a/cpp/subprojects/common/include/mlrl/common/sampling/partition_bi.hpp b/cpp/subprojects/common/include/mlrl/common/sampling/partition_bi.hpp index 2b11c52161..0690f726ff 100644 --- a/cpp/subprojects/common/include/mlrl/common/sampling/partition_bi.hpp +++ b/cpp/subprojects/common/include/mlrl/common/sampling/partition_bi.hpp @@ -126,10 +126,10 @@ class BiPartition final : public VectorDecorator>, const IRowWiseLabelMatrix& labelMatrix, IStatistics& statistics) override; - Quality evaluateOutOfSample(const IThresholdsSubset& thresholdsSubset, const ICoverageState& coverageState, + Quality evaluateOutOfSample(const IFeatureSubspace& featureSubspace, const CoverageMask& coverageMask, const IPrediction& head) override; - void recalculatePrediction(const IThresholdsSubset& thresholdsSubset, const ICoverageState& coverageState, + void recalculatePrediction(const IFeatureSubspace& featureSubspace, const CoverageMask& coverageMask, IPrediction& head) override; std::unique_ptr fitMarginalProbabilityCalibrationModel( diff --git a/cpp/subprojects/common/include/mlrl/common/sampling/partition_single.hpp b/cpp/subprojects/common/include/mlrl/common/sampling/partition_single.hpp index 9647f152a0..78ff3a7cde 100644 --- a/cpp/subprojects/common/include/mlrl/common/sampling/partition_single.hpp +++ b/cpp/subprojects/common/include/mlrl/common/sampling/partition_single.hpp @@ -54,10 +54,10 @@ class SinglePartition final : public IPartition { const IRowWiseLabelMatrix& labelMatrix, IStatistics& statistics) override; - Quality evaluateOutOfSample(const IThresholdsSubset& thresholdsSubset, const ICoverageState& coverageState, + Quality evaluateOutOfSample(const IFeatureSubspace& featureSubspace, const CoverageMask& coverageMask, const IPrediction& head) override; - void recalculatePrediction(const IThresholdsSubset& thresholdsSubset, const ICoverageState& coverageState, + void recalculatePrediction(const IFeatureSubspace& featureSubspace, const CoverageMask& coverageMask, 
IPrediction& head) override; std::unique_ptr fitMarginalProbabilityCalibrationModel( diff --git a/cpp/subprojects/common/include/mlrl/common/sampling/weight_vector.hpp b/cpp/subprojects/common/include/mlrl/common/sampling/weight_vector.hpp index 5f58828aba..6a1084bfb1 100644 --- a/cpp/subprojects/common/include/mlrl/common/sampling/weight_vector.hpp +++ b/cpp/subprojects/common/include/mlrl/common/sampling/weight_vector.hpp @@ -8,8 +8,8 @@ #include // Forward declarations -class IThresholds; -class IThresholdsSubset; +class IFeatureSpace; +class IFeatureSubspace; /** * Defines an interface for one-dimensional vectors that provide access to weights. @@ -27,12 +27,12 @@ class IWeightVector { virtual bool hasZeroWeights() const = 0; /** - * Creates and returns a new instance of type `IThresholdsSubset` that provides access to the statistics that - * correspond to individual training examples whose weights are stored in this vector. + * Creates and returns a new instance of type `IFeatureSubspace` that uses the weights in this vector for the + * training examples it includes. 
* - * @param thresholds A reference to an object of type `IThresholds` that should be used to create the + * @param featureSpace A reference to an object of type `IFeatureSpace` that should be used to create the * instance - * @return An unique pointer to an object of type `IThresholdsSubset` that has been created + * @return An unique pointer to an object of type `IFeatureSubspace` that has been created */ - virtual std::unique_ptr createThresholdsSubset(IThresholds& thresholds) const = 0; + virtual std::unique_ptr createFeatureSubspace(IFeatureSpace& featureSpace) const = 0; }; diff --git a/cpp/subprojects/common/include/mlrl/common/sampling/weight_vector_bit.hpp b/cpp/subprojects/common/include/mlrl/common/sampling/weight_vector_bit.hpp index 8699c6e34b..d134b85c1f 100644 --- a/cpp/subprojects/common/include/mlrl/common/sampling/weight_vector_bit.hpp +++ b/cpp/subprojects/common/include/mlrl/common/sampling/weight_vector_bit.hpp @@ -68,5 +68,5 @@ class BitWeightVector final : public IWeightVector { bool hasZeroWeights() const override; - std::unique_ptr createThresholdsSubset(IThresholds& thresholds) const override; + std::unique_ptr createFeatureSubspace(IFeatureSpace& featureSpace) const override; }; diff --git a/cpp/subprojects/common/include/mlrl/common/sampling/weight_vector_dense.hpp b/cpp/subprojects/common/include/mlrl/common/sampling/weight_vector_dense.hpp index a4a807f95c..ba96277de2 100644 --- a/cpp/subprojects/common/include/mlrl/common/sampling/weight_vector_dense.hpp +++ b/cpp/subprojects/common/include/mlrl/common/sampling/weight_vector_dense.hpp @@ -42,5 +42,5 @@ class DenseWeightVector final : public DenseVectorDecorator>, bool hasZeroWeights() const override; - std::unique_ptr createThresholdsSubset(IThresholds& thresholds) const override; + std::unique_ptr createFeatureSubspace(IFeatureSpace& featureSpace) const override; }; diff --git a/cpp/subprojects/common/include/mlrl/common/sampling/weight_vector_equal.hpp 
b/cpp/subprojects/common/include/mlrl/common/sampling/weight_vector_equal.hpp index 8c929324ed..a1e28c473b 100644 --- a/cpp/subprojects/common/include/mlrl/common/sampling/weight_vector_equal.hpp +++ b/cpp/subprojects/common/include/mlrl/common/sampling/weight_vector_equal.hpp @@ -44,5 +44,5 @@ class EqualWeightVector final : public IWeightVector { bool hasZeroWeights() const override; - std::unique_ptr createThresholdsSubset(IThresholds& thresholds) const override; + std::unique_ptr createFeatureSubspace(IFeatureSpace& featureSpace) const override; }; diff --git a/cpp/subprojects/common/include/mlrl/common/statistics/histogram.hpp b/cpp/subprojects/common/include/mlrl/common/statistics/histogram.hpp deleted file mode 100644 index 865ce0855b..0000000000 --- a/cpp/subprojects/common/include/mlrl/common/statistics/histogram.hpp +++ /dev/null @@ -1,37 +0,0 @@ -/* - * @author Michael Rapp (michael.rapp.ml@gmail.com) - */ -#pragma once - -#include "mlrl/common/statistics/statistics_weighted_immutable.hpp" - -/** - * Defines an interface for all classes that provide access to statistics that are organized as a histogram, i.e., where - * the statistics of multiple training examples are aggregated into the same bin. - */ -class IHistogram : virtual public IImmutableWeightedStatistics { - public: - - virtual ~IHistogram() override {} - - /** - * Sets all statistics in the histogram to zero. - */ - virtual void clear() = 0; - - /** - * Returns the weight of the bin at a specific index, i.e., the number of statistics that have been assigned to - * it. - * - * @param binIndex The index of the bin - * @return The weight of the bin - */ - virtual uint32 getBinWeight(uint32 binIndex) const = 0; - - /** - * Adds the statistic at a specific index to the corresponding bin. 
- * - * @param statisticIndex The index of the statistic - */ - virtual void addToBin(uint32 statisticIndex) = 0; -}; diff --git a/cpp/subprojects/common/include/mlrl/common/statistics/statistics_weighted.hpp b/cpp/subprojects/common/include/mlrl/common/statistics/statistics_weighted.hpp index ceba9ded25..33385fb76c 100644 --- a/cpp/subprojects/common/include/mlrl/common/statistics/statistics_weighted.hpp +++ b/cpp/subprojects/common/include/mlrl/common/statistics/statistics_weighted.hpp @@ -3,9 +3,6 @@ */ #pragma once -#include "mlrl/common/binning/bin_index_vector_dense.hpp" -#include "mlrl/common/binning/bin_index_vector_dok.hpp" -#include "mlrl/common/statistics/histogram.hpp" #include "mlrl/common/statistics/statistics_weighted_immutable.hpp" /** @@ -66,26 +63,4 @@ class IWeightedStatistics : virtual public IImmutableWeightedStatistics { * @param statisticIndex The index of the statistic that should be removed */ virtual void removeCoveredStatistic(uint32 statisticIndex) = 0; - - /** - * Creates and returns a new histogram based on the statistics. - * - * @param binIndexVector A reference to an object of type `DenseBinIndexVector` that stores the indices of - * the bins, individual examples have been assigned to - * @param numBins The number of bins in the histogram - * @return An unique pointer to an object of type `IHistogram` that has been created - */ - virtual std::unique_ptr createHistogram(const DenseBinIndexVector& binIndexVector, - uint32 numBins) const = 0; - - /** - * Creates and returns a new histogram based on the statistics. 
- * - * @param binIndexVector A reference to an object of type `DokBinIndexVector` that stores the indices of the - * bins, individual examples have been assigned to - * @param numBins The number of bins in the histogram - * @return An unique pointer to an object of type `IHistogram` that has been created - */ - virtual std::unique_ptr createHistogram(const DokBinIndexVector& binIndexVector, - uint32 numBins) const = 0; }; diff --git a/cpp/subprojects/common/include/mlrl/common/thresholds/coverage_mask.hpp b/cpp/subprojects/common/include/mlrl/common/thresholds/coverage_mask.hpp deleted file mode 100644 index 534a0dff2c..0000000000 --- a/cpp/subprojects/common/include/mlrl/common/thresholds/coverage_mask.hpp +++ /dev/null @@ -1,73 +0,0 @@ -/* - * @author Michael Rapp (michael.rapp.ml@gmail.com) - */ -#pragma once - -#include "mlrl/common/data/vector_dense.hpp" -#include "mlrl/common/thresholds/coverage_state.hpp" - -/** - * Allows to check whether individual examples are covered by a rule or not. For each example, an integer is stored in a - * C-contiguous array that may be updated when the rule is refined. If the value that corresponds to a certain example - * is equal to the "indicator value", it is considered to be covered. - */ -// TODO: Delete base class and move into directory "data" -class CoverageMask final : public DenseVectorDecorator>, - public ICoverageState { - private: - - uint32 indicatorValue_; - - public: - - /** - * @param numElements The total number of examples - */ - CoverageMask(uint32 numElements); - - /** - * @param other A reference to an object of type `CoverageMask` to be copied - */ - CoverageMask(const CoverageMask& other); - - /** - * Returns the "indicator value". - * - * @return The "indicator value" - */ - uint32 getIndicatorValue() const; - - /** - * Sets the "indicator value". 
- * - * @param indicatorValue The "indicator value" to be set - */ - void setIndicatorValue(uint32 indicatorValue); - - /** - * Resets the mask and the "indicator value" such that all examples are marked as covered. - */ - void reset(); - - /** - * Returns whether the example at a specific index is covered or not. - * - * @param pos The index of the example - * @return True, if the example at the given index is covered, false otherwise - */ - bool isCovered(uint32 pos) const; - - std::unique_ptr copy() const override; - - Quality evaluateOutOfSample(const IThresholdsSubset& thresholdsSubset, const SinglePartition& partition, - const IPrediction& head) const override; - - Quality evaluateOutOfSample(const IThresholdsSubset& thresholdsSubset, BiPartition& partition, - const IPrediction& head) const override; - - void recalculatePrediction(const IThresholdsSubset& thresholdsSubset, const SinglePartition& partition, - IPrediction& head) const override; - - void recalculatePrediction(const IThresholdsSubset& thresholdsSubset, BiPartition& partition, - IPrediction& head) const override; -}; diff --git a/cpp/subprojects/common/include/mlrl/common/thresholds/coverage_set.hpp b/cpp/subprojects/common/include/mlrl/common/thresholds/coverage_set.hpp deleted file mode 100644 index b642ef6fc2..0000000000 --- a/cpp/subprojects/common/include/mlrl/common/thresholds/coverage_set.hpp +++ /dev/null @@ -1,63 +0,0 @@ -/* - * @author Michael Rapp (michael.rapp.ml@gmail.com) - */ -#pragma once - -#include "mlrl/common/data/vector_dense.hpp" -#include "mlrl/common/thresholds/coverage_state.hpp" - -/** - * Provides access to the indices of the examples that are covered by a rule. The indices of the covered examples are - * stored in a C-contiguous array that may be updated when the rule is refined. 
- */ -class CoverageSet final : public DenseVectorDecorator>, - public ICoverageState { - private: - - uint32 numCovered_; - - public: - - /** - * @param numElements The total number of examples - */ - CoverageSet(uint32 numElements); - - /** - * @param other A reference to an object of type `CoverageSet` to be copied - */ - CoverageSet(const CoverageSet& other); - - /** - * Returns the number of covered examples. - * - * @return The number of covered examples - */ - uint32 getNumCovered() const; - - /** - * Sets the number of covered examples. - * - * @param numCovered The number of covered examples to be set - */ - void setNumCovered(uint32 numCovered); - - /** - * Resets the number of covered examples and their indices such that all examples are marked as covered. - */ - void reset(); - - std::unique_ptr copy() const override; - - Quality evaluateOutOfSample(const IThresholdsSubset& thresholdsSubset, const SinglePartition& partition, - const IPrediction& head) const override; - - Quality evaluateOutOfSample(const IThresholdsSubset& thresholdsSubset, BiPartition& partition, - const IPrediction& head) const override; - - void recalculatePrediction(const IThresholdsSubset& thresholdsSubset, const SinglePartition& partition, - IPrediction& head) const override; - - void recalculatePrediction(const IThresholdsSubset& thresholdsSubset, BiPartition& partition, - IPrediction& head) const override; -}; diff --git a/cpp/subprojects/common/include/mlrl/common/thresholds/coverage_state.hpp b/cpp/subprojects/common/include/mlrl/common/thresholds/coverage_state.hpp deleted file mode 100644 index 5af4578749..0000000000 --- a/cpp/subprojects/common/include/mlrl/common/thresholds/coverage_state.hpp +++ /dev/null @@ -1,86 +0,0 @@ -/* - * @author Michael Rapp (michael.rapp.ml@gmail.com) - */ -#pragma once - -#include "mlrl/common/util/quality.hpp" - -#include - -// Forward declarations -class IThresholdsSubset; -class SinglePartition; -class BiPartition; -class IPrediction; - -/** 
- * Defines an interface for all classes that allow to keep track of the examples that are covered by a rule. - */ -class ICoverageState { - public: - - virtual ~ICoverageState() {} - - /** - * Creates and returns a deep copy of the coverage state. - * - * @return An unique pointer to an object of type `ICoverageState` that has been created - */ - virtual std::unique_ptr copy() const = 0; - - /** - * Calculates and returns a numerical score that assesses the quality of a rule's prediction for all examples - * that do not belong to the current sub-sample and are marked as covered. - * - * @param thresholdsSubset A reference to an object of type `IThresholdsSubset` that should be used to - * evaluate the prediction - * @param partition A reference to an object of type `SinglePartition` that provides access to the - * indices of the training examples that belong to the training set - * @param head A reference to an object of type `IPrediction` that stores the scores that are - * predicted by the rule - * @return An object of type `Quality` that stores the calculated quality - */ - virtual Quality evaluateOutOfSample(const IThresholdsSubset& thresholdsSubset, const SinglePartition& partition, - const IPrediction& head) const = 0; - - /** - * Calculates and returns a numerical score that assesses the quality of a rule's prediction for all examples - * that do not belong to the current sub-sample and are marked as covered. 
- * - * @param thresholdsSubset A reference to an object of type `IThresholdsSubset` that should be used to - * evaluate the prediction - * @param partition A reference to an object of type `BiPartition` that provides access to the indices - * of the training examples that belong to the training set - * @param head A reference to an object of type `IPrediction` that stores the scores that are - * predicted by the rule - * @return An object of type `Quality` that stores the calculated quality - */ - virtual Quality evaluateOutOfSample(const IThresholdsSubset& thresholdsSubset, BiPartition& partition, - const IPrediction& head) const = 0; - - /** - * Recalculates and updates a rule's prediction based on all examples in the training set that are marked as - * covered. - * - * @param thresholdsSubset A reference to an object of type `IThresholdsSubset` that should be used to - * recalculate the prediction - * @param partition A reference to an object of type `SinglePartition` that provides access to the - * indices of the training examples that belong to the training set - * @param head A reference to an object of type `IPrediction` to be updated - */ - virtual void recalculatePrediction(const IThresholdsSubset& thresholdsSubset, const SinglePartition& partition, - IPrediction& head) const = 0; - - /** - * Recalculates and updates a rule's prediction based on all examples in the training set that are marked as - * covered. 
- * - * @param thresholdsSubset A reference to an object of type `IThresholdsSubset` that should be used to - * recalculate the prediction - * @param partition A reference to an object of type `BiPartition` that provides access to the indices - * of the training examples that belong to the training set - * @param head A reference to an object of type `IPrediction` to be updated - */ - virtual void recalculatePrediction(const IThresholdsSubset& thresholdsSubset, BiPartition& partition, - IPrediction& head) const = 0; -}; diff --git a/cpp/subprojects/common/include/mlrl/common/thresholds/thresholds_approximate.hpp b/cpp/subprojects/common/include/mlrl/common/thresholds/thresholds_approximate.hpp deleted file mode 100644 index 14b6082141..0000000000 --- a/cpp/subprojects/common/include/mlrl/common/thresholds/thresholds_approximate.hpp +++ /dev/null @@ -1,41 +0,0 @@ -/* - * @author Lukas Johannes Eberle (lukasjohannes.eberle@stud.tu-darmstadt.de) - * @author Michael Rapp (michael.rapp.ml@gmail.com) - */ -#pragma once - -#include "mlrl/common/binning/feature_binning.hpp" -#include "mlrl/common/thresholds/thresholds.hpp" - -/** - * A factory that allows to create instances of the type `ApproximateThresholds`. 
- */ -class ApproximateThresholdsFactory final : public IThresholdsFactory { - private: - - const std::unique_ptr numericalFeatureBinningFactoryPtr_; - - const std::unique_ptr nominalFeatureBinningFactoryPtr_; - - const uint32 numThreads_; - - public: - - /** - * @param numericalFeatureBinningFactoryPtr An unique pointer to an object of type `IFeatureBinningFactory` that - * allows to create implementations of the binning method to be used - * for assigning numerical feature values to bins - * @param nominalFeatureBinningFactoryPtr An unique pointer to an object of type `IFeatureBinningFactory` that - * allows to create implementations of the binning method to be used - * for assigning nominal feature values to bins - * @param numThreads The number of CPU threads to be used to update statistics in - * parallel. Must be at least 1 - */ - ApproximateThresholdsFactory(std::unique_ptr numericalFeatureBinningFactoryPtr, - std::unique_ptr nominalFeatureBinningFactoryPtr, - uint32 numThreads); - - std::unique_ptr create(const IColumnWiseFeatureMatrix& featureMatrix, - const IFeatureInfo& featureInfo, - IStatisticsProvider& statisticsProvider) const override; -}; diff --git a/cpp/subprojects/common/include/mlrl/common/thresholds/thresholds_exact.hpp b/cpp/subprojects/common/include/mlrl/common/thresholds/thresholds_exact.hpp deleted file mode 100644 index 9cba03ce30..0000000000 --- a/cpp/subprojects/common/include/mlrl/common/thresholds/thresholds_exact.hpp +++ /dev/null @@ -1,26 +0,0 @@ -/* - * @author Michael Rapp (michael.rapp.ml@gmail.com) - */ -#pragma once - -#include "mlrl/common/thresholds/thresholds.hpp" - -/** - * A factory that allows to create instances of the type `ExactThresholds`. - */ -class ExactThresholdsFactory final : public IThresholdsFactory { - private: - - const uint32 numThreads_; - - public: - - /** - * @param numThreads The number of CPU threads to be used to update statistics in parallel. 
Must be at least 1 - */ - ExactThresholdsFactory(uint32 numThreads); - - std::unique_ptr create(const IColumnWiseFeatureMatrix& featureMatrix, - const IFeatureInfo& featureInfo, - IStatisticsProvider& statisticsProvider) const override; -}; diff --git a/cpp/subprojects/common/include/mlrl/common/thresholds/thresholds_subset.hpp b/cpp/subprojects/common/include/mlrl/common/thresholds/thresholds_subset.hpp deleted file mode 100644 index c1c4c41fb8..0000000000 --- a/cpp/subprojects/common/include/mlrl/common/thresholds/thresholds_subset.hpp +++ /dev/null @@ -1,235 +0,0 @@ -/* - * @author Michael Rapp (michael.rapp.ml@gmail.com) - */ -#pragma once - -#include "mlrl/common/indices/index_vector_complete.hpp" -#include "mlrl/common/indices/index_vector_partial.hpp" -#include "mlrl/common/model/condition.hpp" -#include "mlrl/common/rule_refinement/prediction.hpp" -#include "mlrl/common/rule_refinement/rule_refinement.hpp" -#include "mlrl/common/sampling/partition_bi.hpp" -#include "mlrl/common/sampling/partition_single.hpp" -#include "mlrl/common/thresholds/coverage_mask.hpp" -#include "mlrl/common/thresholds/coverage_set.hpp" - -#include - -/** - * Defines an interface for all classes that provide access a subset of thresholds that may be used by the conditions of - * a rule with arbitrary body. The thresholds may include only those that correspond to the subspace of the instance - * space that is covered by the rule. - */ -class IThresholdsSubset { - public: - - virtual ~IThresholdsSubset() {} - - /** - * Creates and returns a copy of this object. - * - * @return An unique pointer to an object of type `IThresholdsSubset` that has been created - */ - virtual std::unique_ptr copy() const = 0; - - /** - * Creates and returns a new instance of the type `IRuleRefinement` that allows to find the best refinement of - * an existing rule that predicts for all available labels. 
- * - * @param labelIndices A reference to an object of type `CompleteIndexVector` that provides access to the - * indices of the labels for which the existing rule predicts - * @param featureIndex The index of the feature that should be considered when searching for refinements - * @return An unique pointer to an object of type `IRuleRefinement` that has been created - */ - virtual std::unique_ptr createRuleRefinement(const CompleteIndexVector& labelIndices, - uint32 featureIndex) = 0; - - /** - * Creates and returns a new instance of the type `IRuleRefinement` that allows to find the best refinement of - * an existing rule that predicts for a subset of the available labels. - * - * @param labelIndices A reference to an object of type `PartialIndexVector` that provides access to the - * indices of the labels for which the existing rule predicts - * @param featureIndex The index of the feature that should be considered when searching for refinements - * @return An unique pointer to an object of type `IRuleRefinement` that has been created - */ - virtual std::unique_ptr createRuleRefinement(const PartialIndexVector& labelIndices, - uint32 featureIndex) = 0; - - /** - * Filters the thresholds such that only those thresholds, which correspond to the instance space that is - * covered by specific condition of a rule, are included. - * - * @param condition A reference to an object of type `Condition` that stores the properties of the condition - */ - virtual void filterThresholds(const Condition& condition) = 0; - - /** - * Resets the filtered thresholds. This reverts the effects of all previous calls to the function - * `filterThresholds`. - */ - virtual void resetThresholds() = 0; - - /** - * Returns an object of type `ICoverageState` that keeps track of the elements that are covered by the - * refinement that has been applied via the function `applyRefinement`. 
- * - * @return A reference to an object of type `ICoverageState` that keeps track of the elements that are covered - * by the refinement - */ - virtual const ICoverageState& getCoverageState() const = 0; - - /** - * Calculates and returns a numerical score that assesses the quality of a rule's prediction for all examples - * that do not belong to the current sub-sample and are marked as covered according to a given object of type - * `CoverageMask`. - * - * For calculating the quality, only examples that belong to the training set and are not included in the - * current sub-sample, i.e., only examples with zero weights, are considered. - * - * @param partition A reference to an object of type `SinglePartition` that provides access to the indices - * of the training examples that belong to the training set - * @param coverageState A reference to an object of type `CoverageMask` that keeps track of the examples that - * are covered by the rule - * @param head A reference to an object of type `IPrediction` that stores the scores that are predicted - * by the rule - * @return An object of type `Quality` that stores the calculated quality - */ - virtual Quality evaluateOutOfSample(const SinglePartition& partition, const CoverageMask& coverageState, - const IPrediction& head) const = 0; - - /** - * Calculates and returns a numerical score that assesses the quality of a rule's prediction for all examples - * that do not belong to the current sub-sample and are marked as covered according to a given object of type - * `CoverageMask`. - * - * For calculating the quality, only examples that belong to the training set and are not included in the - * current sub-sample, i.e., only examples with zero weights, are considered. 
- * - * @param partition A reference to an object of type `BiPartition` that provides access to the indices of - * the training examples that belong to the training set - * @param coverageState A reference to an object of type `CoverageMask` that keeps track of the examples that - * are covered by the rule - * @param head A reference to an object of type `IPrediction` that stores the scores that are predicted - * by the rule - * @return An object of type `Quality` that stores the calculated quality - */ - virtual Quality evaluateOutOfSample(const BiPartition& partition, const CoverageMask& coverageState, - const IPrediction& head) const = 0; - - /** - * Calculates and returns a numerical score that assesses the quality of a rule's prediction for all examples - * that do not belong to the current sub-sample and are marked as covered according to a given object of type - * `CoverageSet`. - * - * For calculating the quality, only examples that belong to the training set and are not included in the - * current sub-sample, i.e., only examples with zero weights, are considered. - * - * @param partition A reference to an object of type `SinglePartition` that provides access to the indices - * of the training examples that belong to the training set - * @param coverageState A reference to an object of type `CoverageSet` that keeps track of the examples that are - * covered by the rule - * @param head A reference to an object of type `IPrediction` that stores the scores that are predicted - * by the rule - * @return An object of type `Quality` that stores the calculated quality - */ - virtual Quality evaluateOutOfSample(const SinglePartition& partition, const CoverageSet& coverageState, - const IPrediction& head) const = 0; - - /** - * Calculates and returns a numerical score that assesses the quality of a rule's prediction for all examples - * that do not belong to the current sub-sample and are marked as covered according to a given object of type - * `CoverageSet`. 
- * - * For calculating the quality, only examples that belong to the training set and are not included in the - * current sub-sample, i.e., only examples with zero weights, are considered. - * - * @param partition A reference to an object of type `BiPartition` that provides access to the indices of - * the training examples that belong to the training set - * @param coverageState A reference to an object of type `CoverageSet` that keeps track of the examples that are - * covered by the rule - * @param head A reference to an object of type `IPrediction` that stores the scores that are predicted - * by the rule - * @return An object of type `Quality` that stores the calculated quality - */ - virtual Quality evaluateOutOfSample(BiPartition& partition, const CoverageSet& coverageState, - const IPrediction& head) const = 0; - - /** - * Recalculates and updates a rule's prediction based on all examples in the training set that are marked as - * covered according to a given object of type `CoverageMask`. - * - * When calculating the updated prediction, the weights of the individual training examples are ignored and - * equally distributed weights are assumed instead. - * - * @param partition A reference to an object of type `SinglePartition` that provides access to the indices - * of the training examples that belong to the training set - * @param coverageState A reference to an object of type `CoverageMask` that keeps track of the examples that - * are covered by the rule - * @param head A reference to an object of type `IPrediction` to be updated - */ - virtual void recalculatePrediction(const SinglePartition& partition, const CoverageMask& coverageState, - IPrediction& head) const = 0; - - /** - * Recalculates and updates a rule's prediction based on all examples in the training set that are marked as - * covered according to a given object of type `CoverageMask`. 
- * - * When calculating the updated prediction, the weights of the individual training examples are ignored and - * equally distributed weights are assumed instead. - * - * @param partition A reference to an object of type `BiPartition` that provides access to the indices of - * the training examples that belong to the training set - * @param coverageState A reference to an object of type `CoverageMask` that keeps track of the examples that - * are covered by the rule - * @param head A reference to an object of type `IPrediction` to be updated - */ - virtual void recalculatePrediction(const BiPartition& partition, const CoverageMask& coverageState, - IPrediction& head) const = 0; - - /** - * Recalculates and updates a rule's prediction based on all examples in the training set that are marked as - * covered according to a given object of type `CoverageSet`. - * - * When calculating the updated prediction, the weights of the individual training examples are ignored and - * equally distributed weights are assumed instead. - * - * @param partition A reference to an object of type `SinglePartition` that provides access to the indices - * of the training examples that belong to the training set - * @param coverageState A reference to an object of type `CoverageMask` that keeps track of the examples that - * are covered by the rule - * @param head A reference to an object of type `IPrediction` to be updated - */ - virtual void recalculatePrediction(const SinglePartition& partition, const CoverageSet& coverageState, - IPrediction& head) const = 0; - - /** - * Recalculates and updates a rule's prediction based on all examples in the training set that are marked as - * covered according to a given object of type `CoverageSet`. - * - * When calculating the updated prediction, the weights of the individual training examples are ignored and - * equally distributed weights are assumed instead. 
- * - * @param partition A reference to an object of type `BiPartition` that provides access to the indices of - * the training examples that belong to the training set - * @param coverageState A reference to an object of type `CoverageSet` that keeps track of the examples that are - * covered by the rule - * @param head A reference to an object of type `IPrediction` to be updated - */ - virtual void recalculatePrediction(BiPartition& partition, const CoverageSet& coverageState, - IPrediction& head) const = 0; - - /** - * Updates the statistics that correspond to the current subset based on the prediction of a rule. - * - * @param prediction A reference to an object of type `IPrediction` that stores the prediction of the rule - */ - virtual void applyPrediction(const IPrediction& prediction) = 0; - - /** - * Reverts the statistics that correspond to the current subset based on the predictions of a rule. - * - * @param prediction A reference to an object of type `IPrediction` that stores the prediction of the rule - */ - virtual void revertPrediction(const IPrediction& prediction) = 0; -}; diff --git a/cpp/subprojects/common/meson.build b/cpp/subprojects/common/meson.build index cf4df9071d..06bf01ab95 100644 --- a/cpp/subprojects/common/meson.build +++ b/cpp/subprojects/common/meson.build @@ -2,17 +2,13 @@ project('common', 'cpp') # Source files source_files = [ - 'src/mlrl/common/binning/bin_index_vector_dense.cpp', - 'src/mlrl/common/binning/bin_index_vector_dok.cpp', - 'src/mlrl/common/binning/bin_weight_vector.cpp', - 'src/mlrl/common/binning/feature_binning_equal_frequency.cpp', - 'src/mlrl/common/binning/feature_binning_equal_width.cpp', - 'src/mlrl/common/binning/feature_binning_no.cpp', - 'src/mlrl/common/binning/threshold_vector.cpp', 'src/mlrl/common/data/vector_bit.cpp', 'src/mlrl/common/indices/index_iterator.cpp', 'src/mlrl/common/indices/index_vector_complete.cpp', 'src/mlrl/common/indices/index_vector_partial.cpp', + 
'src/mlrl/common/input/feature_binning_equal_frequency.cpp', + 'src/mlrl/common/input/feature_binning_equal_width.cpp', + 'src/mlrl/common/input/feature_binning_no.cpp', 'src/mlrl/common/input/feature_info_equal.cpp', 'src/mlrl/common/input/feature_info_mixed.cpp', 'src/mlrl/common/input/feature_matrix_c_contiguous.cpp', @@ -22,13 +18,12 @@ source_files = [ 'src/mlrl/common/input/feature_type_nominal.cpp', 'src/mlrl/common/input/feature_type_numerical.cpp', 'src/mlrl/common/input/feature_type_ordinal.cpp', - 'src/mlrl/common/input/feature_vector.cpp', + 'src/mlrl/common/input/feature_vector_binned.cpp', 'src/mlrl/common/input/feature_vector_equal.cpp', 'src/mlrl/common/input/feature_vector_nominal.cpp', 'src/mlrl/common/input/feature_vector_numerical.cpp', 'src/mlrl/common/input/label_matrix_c_contiguous.cpp', 'src/mlrl/common/input/label_matrix_csr.cpp', - 'src/mlrl/common/input/missing_feature_vector.cpp', 'src/mlrl/common/model/body_conjunctive.cpp', 'src/mlrl/common/model/body_empty.cpp', 'src/mlrl/common/model/condition_list.cpp', @@ -56,13 +51,14 @@ source_files = [ 'src/mlrl/common/rule_model_assemblage/rule_model_assemblage_sequential.cpp', 'src/mlrl/common/rule_pruning/rule_pruning_irep.cpp', 'src/mlrl/common/rule_pruning/rule_pruning_no.cpp', + 'src/mlrl/common/rule_refinement/coverage_mask.cpp', + 'src/mlrl/common/rule_refinement/feature_based_search.cpp', + 'src/mlrl/common/rule_refinement/feature_space_tabular.cpp', 'src/mlrl/common/rule_refinement/prediction_complete.cpp', 'src/mlrl/common/rule_refinement/prediction_partial.cpp', 'src/mlrl/common/rule_refinement/refinement_comparator_fixed.cpp', 'src/mlrl/common/rule_refinement/refinement_comparator_single.cpp', - 'src/mlrl/common/rule_refinement/rule_refinement_approximate.cpp', - 'src/mlrl/common/rule_refinement/rule_refinement_exact.cpp', - 'src/mlrl/common/rule_refinement/rule_refinement_search.cpp', + 'src/mlrl/common/rule_refinement/rule_refinement_feature_based.cpp', 
'src/mlrl/common/rule_refinement/score_processor.cpp', 'src/mlrl/common/sampling/feature_sampling_no.cpp', 'src/mlrl/common/sampling/feature_sampling_predefined.cpp', @@ -93,10 +89,6 @@ source_files = [ 'src/mlrl/common/stopping/stopping_criterion_list.cpp', 'src/mlrl/common/stopping/stopping_criterion_size.cpp', 'src/mlrl/common/stopping/stopping_criterion_time.cpp', - 'src/mlrl/common/thresholds/coverage_mask.cpp', - 'src/mlrl/common/thresholds/coverage_set.cpp', - 'src/mlrl/common/thresholds/thresholds_approximate.cpp', - 'src/mlrl/common/thresholds/thresholds_exact.cpp', 'src/mlrl/common/info.cpp', 'src/mlrl/common/learner.cpp' ] @@ -106,10 +98,13 @@ test_files = [ 'test/mlrl/common/data/array.cpp', 'test/mlrl/common/data/vector_bit.cpp', 'test/mlrl/common/data/vector_dense.cpp', + 'test/mlrl/common/input/feature_binning_equal_frequency.cpp', + 'test/mlrl/common/input/feature_binning_equal_width.cpp', 'test/mlrl/common/input/feature_type_nominal.cpp', 'test/mlrl/common/input/feature_type_numerical.cpp', 'test/mlrl/common/input/feature_type_ordinal.cpp', 'test/mlrl/common/input/feature_vector_decorator_binary.cpp', + 'test/mlrl/common/input/feature_vector_decorator_binned.cpp', 'test/mlrl/common/input/feature_vector_decorator_nominal.cpp', 'test/mlrl/common/input/feature_vector_decorator_numerical.cpp', 'test/mlrl/common/input/feature_vector_decorator_ordinal.cpp', diff --git a/cpp/subprojects/common/src/mlrl/common/binning/bin_index_vector_dense.cpp b/cpp/subprojects/common/src/mlrl/common/binning/bin_index_vector_dense.cpp deleted file mode 100644 index b8832f94ca..0000000000 --- a/cpp/subprojects/common/src/mlrl/common/binning/bin_index_vector_dense.cpp +++ /dev/null @@ -1,19 +0,0 @@ -#include "mlrl/common/binning/bin_index_vector_dense.hpp" - -#include "mlrl/common/statistics/statistics_weighted.hpp" - -DenseBinIndexVector::DenseBinIndexVector(uint32 numElements) - : DenseVectorDecorator>(AllocatedVector(numElements)) {} - -uint32 
DenseBinIndexVector::getBinIndex(uint32 exampleIndex) const { - return (*this)[exampleIndex]; -} - -void DenseBinIndexVector::setBinIndex(uint32 exampleIndex, uint32 binIndex) { - (*this)[exampleIndex] = binIndex; -} - -std::unique_ptr DenseBinIndexVector::createHistogram(const IWeightedStatistics& statistics, - uint32 numBins) const { - return statistics.createHistogram(*this, numBins); -} diff --git a/cpp/subprojects/common/src/mlrl/common/binning/bin_index_vector_dok.cpp b/cpp/subprojects/common/src/mlrl/common/binning/bin_index_vector_dok.cpp deleted file mode 100644 index 4e1e0d6aae..0000000000 --- a/cpp/subprojects/common/src/mlrl/common/binning/bin_index_vector_dok.cpp +++ /dev/null @@ -1,19 +0,0 @@ -#include "mlrl/common/binning/bin_index_vector_dok.hpp" - -#include "mlrl/common/statistics/statistics_weighted.hpp" - -DokBinIndexVector::DokBinIndexVector() - : DokVectorDecorator>(AllocatedDokVector(BIN_INDEX_SPARSE)) {} - -uint32 DokBinIndexVector::getBinIndex(uint32 exampleIndex) const { - return this->view[exampleIndex]; -} - -void DokBinIndexVector::setBinIndex(uint32 exampleIndex, uint32 binIndex) { - this->view.set(exampleIndex, binIndex); -} - -std::unique_ptr DokBinIndexVector::createHistogram(const IWeightedStatistics& statistics, - uint32 numBins) const { - return statistics.createHistogram(*this, numBins); -} diff --git a/cpp/subprojects/common/src/mlrl/common/binning/bin_weight_vector.cpp b/cpp/subprojects/common/src/mlrl/common/binning/bin_weight_vector.cpp deleted file mode 100644 index fc69bbb851..0000000000 --- a/cpp/subprojects/common/src/mlrl/common/binning/bin_weight_vector.cpp +++ /dev/null @@ -1,12 +0,0 @@ -#include "mlrl/common/binning/bin_weight_vector.hpp" - -BinWeightVector::BinWeightVector(uint32 numElements) - : ClearableViewDecorator>>(AllocatedVector(numElements)) {} - -void BinWeightVector::increaseWeight(uint32 pos) { - this->view.array[pos] += 1; -} - -bool BinWeightVector::operator[](uint32 pos) const { - return 
this->view.array[pos] != 0; -} diff --git a/cpp/subprojects/common/src/mlrl/common/binning/feature_binning_equal_frequency.cpp b/cpp/subprojects/common/src/mlrl/common/binning/feature_binning_equal_frequency.cpp deleted file mode 100644 index 8052e76139..0000000000 --- a/cpp/subprojects/common/src/mlrl/common/binning/feature_binning_equal_frequency.cpp +++ /dev/null @@ -1,231 +0,0 @@ -#include "mlrl/common/binning/feature_binning_equal_frequency.hpp" - -#include "feature_binning_nominal.hpp" -#include "mlrl/common/binning/bin_index_vector_dense.hpp" -#include "mlrl/common/binning/bin_index_vector_dok.hpp" -#include "mlrl/common/thresholds/thresholds_approximate.hpp" -#include "mlrl/common/util/math.hpp" -#include "mlrl/common/util/validation.hpp" - -static inline uint32 getNumBins(FeatureVector& featureVector, bool sparse, float32 binRatio, uint32 minBins, - uint32 maxBins) { - uint32 numElements = featureVector.getNumElements(); - - if (numElements > 0) { - featureVector.sortByValues(); - FeatureVector::const_iterator featureIterator = featureVector.cbegin(); - uint32 numDistinctValues = 1; - float32 previousValue; - uint32 i; - - if (sparse) { - previousValue = 0; - i = 0; - } else { - previousValue = featureIterator[0].value; - i = 1; - } - - for (; i < numElements; i++) { - float32 currentValue = featureIterator[i].value; - - if ((!sparse || currentValue != 0) && currentValue != previousValue) { - numDistinctValues++; - previousValue = currentValue; - } - } - - return numDistinctValues > 1 ? calculateBoundedFraction(numDistinctValues, binRatio, minBins, maxBins) : 0; - } - - return 0; -} - -/** - * An implementation of the type `IFeatureBinning` that assigns numerical feature values to bins, such that each bin - * contains approximately the same number of values. 
- */ -class EqualFrequencyFeatureBinning final : public IFeatureBinning { - private: - - const float32 binRatio_; - - const uint32 minBins_; - - const uint32 maxBins_; - - public: - - /** - * @param binRatio A percentage that specifies how many bins should be used, e.g., if 100 values are available, - * 0.5 means that `ceil(0.5 * 100) = 50` bins should be used. Must be in (0, 1) - * @param minBins The minimum number of bins to be used. Must be at least 2 - * @param maxBins The maximum number of bins to be used. Must be at least `minBins` or 0, if the maximum - * number of bins should not be restricted - */ - EqualFrequencyFeatureBinning(float32 binRatio, uint32 minBins, uint32 maxBins) - : binRatio_(binRatio), minBins_(minBins), maxBins_(maxBins) {} - - Result createBins(FeatureVector& featureVector, uint32 numExamples) const override { - Result result; - uint32 numElements = featureVector.getNumElements(); - uint32 numSparse = numExamples - numElements; - bool sparse = numSparse > 0; - uint32 numBins = getNumBins(featureVector, sparse, binRatio_, minBins_, maxBins_); - result.thresholdVectorPtr = std::make_unique(featureVector, numBins); - - if (sparse) { - result.binIndicesPtr = std::make_unique(); - } else { - result.binIndicesPtr = std::make_unique(numElements); - } - - if (numBins > 0) { - IBinIndexVector& binIndices = *result.binIndicesPtr; - ThresholdVector& thresholdVector = *result.thresholdVectorPtr; - FeatureVector::const_iterator featureIterator = featureVector.cbegin(); - ThresholdVector::iterator thresholdIterator = thresholdVector.begin(); - uint32 numElementsPerBin = (uint32) std::ceil((float) numElements / (float) numBins); - uint32 numElementsInCurrentBin = 0; - uint32 binIndex = 0; - float32 previousValue = 0; - uint32 i = 0; - - // Iterate feature values < 0... 
- for (; i < numElements; i++) { - float32 currentValue = featureIterator[i].value; - - if (currentValue >= 0) { - break; - } - - if (currentValue != previousValue) { - if (numElementsInCurrentBin >= numElementsPerBin) { - thresholdIterator[binIndex] = arithmeticMean(previousValue, currentValue); - binIndex++; - numElementsInCurrentBin = 0; - } - - previousValue = currentValue; - } - - binIndices.setBinIndex(featureIterator[i].index, binIndex); - numElementsInCurrentBin++; - } - - // If there are any sparse values, check if they belong to the current one or the next one... - if (sparse) { - previousValue = 0; - - if (numElementsInCurrentBin >= numElementsPerBin) { - thresholdIterator[binIndex] = arithmeticMean(previousValue, 0); - binIndex++; - numElementsInCurrentBin = 0; - } - - thresholdVector.setSparseBinIndex(binIndex); - numElementsInCurrentBin += numSparse; - } - - // Iterate feature values >= 0... - for (; i < numElements; i++) { - float32 currentValue = featureIterator[i].value; - - if (!sparse || currentValue != 0) { - if (currentValue != previousValue) { - if (numElementsInCurrentBin >= numElementsPerBin) { - thresholdIterator[binIndex] = arithmeticMean(previousValue, currentValue); - binIndex++; - numElementsInCurrentBin = 0; - } - - previousValue = currentValue; - } - - binIndices.setBinIndex(featureIterator[i].index, binIndex); - numElementsInCurrentBin++; - } - } - - thresholdVector.setNumElements(binIndex + 1, true); - } - - return result; - } -}; - -/** - * Allows to create instances of the type `IFeatureBinning` that assign numerical feature values to bins, such that each - * bin contains approximately the same number of values. 
- */ -class EqualFrequencyFeatureBinningFactory final : public IFeatureBinningFactory { - private: - - const float32 binRatio_; - - const uint32 minBins_; - - const uint32 maxBins_; - - public: - - /** - * @param binRatio A percentage that specifies how many bins should be used, e.g., if 100 values are available, - * a percentage of 0.5 means that `ceil(0.5 * 100) = 50` bins should be used. Must be in (0, 1) - * @param minBins The minimum number of bins to be used. Must be at least 2 - * @param maxBins The maximum number of bins to be used. Must be at least `minBins` or 0, if the maximum - * number of bins should not be restricted - */ - EqualFrequencyFeatureBinningFactory(float32 binRatio, uint32 minBins, uint32 maxBins) - : binRatio_(binRatio), minBins_(minBins), maxBins_(maxBins) {} - - std::unique_ptr create() const override { - return std::make_unique(binRatio_, minBins_, maxBins_); - } -}; - -EqualFrequencyFeatureBinningConfig::EqualFrequencyFeatureBinningConfig( - const std::unique_ptr& multiThreadingConfigPtr) - : binRatio_(0.33f), minBins_(2), maxBins_(0), multiThreadingConfigPtr_(multiThreadingConfigPtr) {} - -float32 EqualFrequencyFeatureBinningConfig::getBinRatio() const { - return binRatio_; -} - -IEqualFrequencyFeatureBinningConfig& EqualFrequencyFeatureBinningConfig::setBinRatio(float32 binRatio) { - assertGreater("binRatio", binRatio, 0); - assertLess("binRatio", binRatio, 1); - binRatio_ = binRatio; - return *this; -} - -uint32 EqualFrequencyFeatureBinningConfig::getMinBins() const { - return minBins_; -} - -IEqualFrequencyFeatureBinningConfig& EqualFrequencyFeatureBinningConfig::setMinBins(uint32 minBins) { - assertGreaterOrEqual("minBins", minBins, 2); - minBins_ = minBins; - return *this; -} - -uint32 EqualFrequencyFeatureBinningConfig::getMaxBins() const { - return maxBins_; -} - -IEqualFrequencyFeatureBinningConfig& EqualFrequencyFeatureBinningConfig::setMaxBins(uint32 maxBins) { - if (maxBins != 0) assertGreaterOrEqual("maxBins", maxBins, 
minBins_); - maxBins_ = maxBins; - return *this; -} - -std::unique_ptr EqualFrequencyFeatureBinningConfig::createThresholdsFactory( - const IFeatureMatrix& featureMatrix, const ILabelMatrix& labelMatrix) const { - std::unique_ptr numericalFeatureBinningFactoryPtr = - std::make_unique(binRatio_, minBins_, maxBins_); - std::unique_ptr nominalFeatureBinningFactoryPtr = - std::make_unique(); - uint32 numThreads = multiThreadingConfigPtr_->getNumThreads(featureMatrix, labelMatrix.getNumLabels()); - return std::make_unique(std::move(numericalFeatureBinningFactoryPtr), - std::move(nominalFeatureBinningFactoryPtr), numThreads); -} diff --git a/cpp/subprojects/common/src/mlrl/common/binning/feature_binning_equal_width.cpp b/cpp/subprojects/common/src/mlrl/common/binning/feature_binning_equal_width.cpp deleted file mode 100644 index 4a27901796..0000000000 --- a/cpp/subprojects/common/src/mlrl/common/binning/feature_binning_equal_width.cpp +++ /dev/null @@ -1,250 +0,0 @@ -#include "mlrl/common/binning/feature_binning_equal_width.hpp" - -#include "feature_binning_nominal.hpp" -#include "mlrl/common/binning/bin_index_vector_dense.hpp" -#include "mlrl/common/binning/bin_index_vector_dok.hpp" -#include "mlrl/common/data/array.hpp" -#include "mlrl/common/thresholds/thresholds_approximate.hpp" -#include "mlrl/common/util/math.hpp" -#include "mlrl/common/util/validation.hpp" - -#include -#include - -static inline std::tuple preprocess(const FeatureVector& featureVector, bool sparse, - float32 binRatio, uint32 minBins, uint32 maxBins) { - std::tuple result; - uint32 numElements = featureVector.getNumElements(); - - if (numElements > 0) { - FeatureVector::const_iterator featureIterator = featureVector.cbegin(); - float32 minValue; - uint32 i; - - if (sparse) { - minValue = 0; - i = 0; - } else { - minValue = featureIterator[0].value; - i = 1; - } - - float32 maxValue = minValue; - uint32 numDistinctValues = 1; - std::unordered_set distinctValues; - - for (; i < numElements; i++) { - 
float32 currentValue = featureIterator[i].value; - - if ((!sparse || currentValue != 0) && distinctValues.insert(currentValue).second) { - numDistinctValues++; - - if (currentValue < minValue) { - minValue = currentValue; - } - - if (currentValue > maxValue) { - maxValue = currentValue; - } - } - } - - std::get<0>(result) = - numDistinctValues > 1 ? calculateBoundedFraction(numDistinctValues, binRatio, minBins, maxBins) : 0; - std::get<1>(result) = minValue; - std::get<2>(result) = maxValue; - } else { - std::get<0>(result) = 0; - } - - return result; -} - -static inline uint32 getBinIndex(float32 value, float32 min, float32 width, uint32 numBins) { - uint32 binIndex = (uint32) std::floor((value - min) / width); - return binIndex >= numBins ? numBins - 1 : binIndex; -} - -/** - * An implementation of the type `IFeatureBinning` that assigns numerical feature values to bins, such that each bin - * contains values from equally sized value ranges. - */ -class EqualWidthFeatureBinning final : public IFeatureBinning { - private: - - const float32 binRatio_; - - const uint32 minBins_; - - const uint32 maxBins_; - - public: - - /** - * @param binRatio A percentage that specifies how many bins should be used, e.g., if 100 values are available, - * 0.5 means that `ceil(0.5 * 100) = 50` bins should be used. Must be in (0, 1) - * @param minBins The minimum number of bins to be used. Must be at least 2 - * @param maxBins The maximum number of bins to be used. 
Must be at least `minBins` or 0, if the maximum - * number of bins should not be restricted - */ - EqualWidthFeatureBinning(float32 binRatio, uint32 minBins, uint32 maxBins) - : binRatio_(binRatio), minBins_(minBins), maxBins_(maxBins) {} - - Result createBins(FeatureVector& featureVector, uint32 numExamples) const override { - Result result; - uint32 numElements = featureVector.getNumElements(); - bool sparse = numElements < numExamples; - std::tuple tuple = - preprocess(featureVector, sparse, binRatio_, minBins_, maxBins_); - uint32 numBins = std::get<0>(tuple); - result.thresholdVectorPtr = std::make_unique(featureVector, numBins, true); - - if (sparse) { - result.binIndicesPtr = std::make_unique(); - } else { - result.binIndicesPtr = std::make_unique(numElements); - } - - if (numBins > 0) { - IBinIndexVector& binIndices = *result.binIndicesPtr; - ThresholdVector& thresholdVector = *result.thresholdVectorPtr; - FeatureVector::const_iterator featureIterator = featureVector.cbegin(); - ThresholdVector::iterator thresholdIterator = thresholdVector.begin(); - float32 min = std::get<1>(tuple); - float32 max = std::get<2>(tuple); - float32 width = (max - min) / numBins; - uint32 sparseBinIndex; - - // If there are any sparse values, identify the bin they belong to... - if (sparse) { - sparseBinIndex = getBinIndex(0, min, width, numBins); - thresholdIterator[sparseBinIndex] = 1; - thresholdVector.setSparseBinIndex(sparseBinIndex); - } else { - sparseBinIndex = numBins; - } - - // Iterate all non-sparse feature values and identify the bins they belong to... - for (uint32 i = 0; i < numElements; i++) { - float32 currentValue = featureIterator[i].value; - - if (!sparse || currentValue != 0) { - uint32 binIndex = getBinIndex(currentValue, min, width, numBins); - - if (binIndex != sparseBinIndex) { - thresholdIterator[binIndex] = 1; - binIndices.setBinIndex(featureIterator[i].index, binIndex); - } - } - } - - // Remove empty bins and calculate thresholds... 
- Array mapping(numBins); - uint32 n = 0; - - for (uint32 i = 0; i < numBins; i++) { - mapping[i] = n; - - if (thresholdIterator[i] > 0) { - thresholdIterator[n] = min + ((i + 1) * width); - n++; - } - } - - thresholdVector.setNumElements(n, true); - - // Adjust bin indices... - DokBinIndexVector* dokBinIndices = dynamic_cast(&binIndices); - - if (dokBinIndices) { - for (auto it = dokBinIndices->begin(); it != dokBinIndices->end(); it++) { - uint32 binIndex = it->second; - it->second = mapping[binIndex]; - } - } else { - for (uint32 i = 0; i < numElements; i++) { - uint32 binIndex = binIndices.getBinIndex(i); - binIndices.setBinIndex(i, mapping[binIndex]); - } - } - } - - return result; - } -}; - -/** - * Allows to create instances of the type `IFeatureBinning` that assign numerical feature values to bins, such that each - * bin contains values from equally sized value ranges. - */ -class EqualWidthFeatureBinningFactory final : public IFeatureBinningFactory { - private: - - const float32 binRatio_; - - const uint32 minBins_; - - const uint32 maxBins_; - - public: - - /** - * @param binRatio A percentage that specifies how many bins should be used, e.g., if 100 values are available, - * 0.5 means that `ceil(0.5 * 100) = 50` bins should be used. Must be in (0, 1) - * @param minBins The minimum number of bins to be used. Must be at least 2 - * @param maxBins The maximum number of bins to be used. 
Must be at least `minBins` or 0, if the maximum - * number of bins should not be restricted - */ - EqualWidthFeatureBinningFactory(float32 binRatio, uint32 minBins, uint32 maxBins) - : binRatio_(binRatio), minBins_(minBins), maxBins_(maxBins) {} - - std::unique_ptr create() const override { - return std::make_unique(binRatio_, minBins_, maxBins_); - } -}; - -EqualWidthFeatureBinningConfig::EqualWidthFeatureBinningConfig( - const std::unique_ptr& multiThreadingConfigPtr) - : binRatio_(0.33f), minBins_(2), maxBins_(0), multiThreadingConfigPtr_(multiThreadingConfigPtr) {} - -float32 EqualWidthFeatureBinningConfig::getBinRatio() const { - return binRatio_; -} - -IEqualWidthFeatureBinningConfig& EqualWidthFeatureBinningConfig::setBinRatio(float32 binRatio) { - assertGreater("binRatio", binRatio, 0); - assertLess("binRatio", binRatio, 1); - binRatio_ = binRatio; - return *this; -} - -uint32 EqualWidthFeatureBinningConfig::getMinBins() const { - return minBins_; -} - -IEqualWidthFeatureBinningConfig& EqualWidthFeatureBinningConfig::setMinBins(uint32 minBins) { - assertGreaterOrEqual("minBins", minBins, 2); - minBins_ = minBins; - return *this; -} - -uint32 EqualWidthFeatureBinningConfig::getMaxBins() const { - return maxBins_; -} - -IEqualWidthFeatureBinningConfig& EqualWidthFeatureBinningConfig::setMaxBins(uint32 maxBins) { - if (maxBins != 0) assertGreaterOrEqual("maxBins", maxBins, minBins_); - maxBins_ = maxBins; - return *this; -} - -std::unique_ptr EqualWidthFeatureBinningConfig::createThresholdsFactory( - const IFeatureMatrix& featureMatrix, const ILabelMatrix& labelMatrix) const { - std::unique_ptr numericalFeatureBinningFactoryPtr = - std::make_unique(binRatio_, minBins_, maxBins_); - std::unique_ptr nominalFeatureBinningFactoryPtr = - std::make_unique(); - uint32 numThreads = multiThreadingConfigPtr_->getNumThreads(featureMatrix, labelMatrix.getNumLabels()); - return std::make_unique(std::move(numericalFeatureBinningFactoryPtr), - 
std::move(nominalFeatureBinningFactoryPtr), numThreads); -} diff --git a/cpp/subprojects/common/src/mlrl/common/binning/feature_binning_no.cpp b/cpp/subprojects/common/src/mlrl/common/binning/feature_binning_no.cpp deleted file mode 100644 index 25bd5dfad8..0000000000 --- a/cpp/subprojects/common/src/mlrl/common/binning/feature_binning_no.cpp +++ /dev/null @@ -1,12 +0,0 @@ -#include "mlrl/common/binning/feature_binning_no.hpp" - -#include "mlrl/common/thresholds/thresholds_exact.hpp" - -NoFeatureBinningConfig::NoFeatureBinningConfig(const std::unique_ptr& multiThreadingConfigPtr) - : multiThreadingConfigPtr_(multiThreadingConfigPtr) {} - -std::unique_ptr NoFeatureBinningConfig::createThresholdsFactory( - const IFeatureMatrix& featureMatrix, const ILabelMatrix& labelMatrix) const { - uint32 numThreads = multiThreadingConfigPtr_->getNumThreads(featureMatrix, labelMatrix.getNumLabels()); - return std::make_unique(numThreads); -} diff --git a/cpp/subprojects/common/src/mlrl/common/binning/feature_binning_nominal.hpp b/cpp/subprojects/common/src/mlrl/common/binning/feature_binning_nominal.hpp deleted file mode 100644 index 33b49c5ac3..0000000000 --- a/cpp/subprojects/common/src/mlrl/common/binning/feature_binning_nominal.hpp +++ /dev/null @@ -1,80 +0,0 @@ -/* - * @author Michael Rapp (michael.rapp.ml@gmail.com) - */ -#pragma once - -#include "mlrl/common/binning/bin_index_vector_dense.hpp" -#include "mlrl/common/binning/bin_index_vector_dok.hpp" -#include "mlrl/common/binning/feature_binning.hpp" - -#include - -/** - * An implementation of the type `IFeatureBinning` that assigns nominal feature values to bins, such that each bin - * contains one of the available values. 
- */ -class NominalFeatureBinning final : public IFeatureBinning { - public: - - Result createBins(FeatureVector& featureVector, uint32 numExamples) const override { - Result result; - uint32 numElements = featureVector.getNumElements(); - bool sparse = numElements < numExamples; - uint32 maxBins = sparse ? numElements + 1 : numElements; - result.thresholdVectorPtr = std::make_unique(featureVector, maxBins); - - if (sparse) { - result.binIndicesPtr = std::make_unique(); - } else { - result.binIndicesPtr = std::make_unique(numElements); - } - - if (numElements > 0) { - IBinIndexVector& binIndices = *result.binIndicesPtr; - ThresholdVector& thresholdVector = *result.thresholdVectorPtr; - FeatureVector::const_iterator featureIterator = featureVector.cbegin(); - ThresholdVector::iterator thresholdIterator = thresholdVector.begin(); - std::unordered_map mapping; - uint32 nextBinIndex = 0; - - if (sparse) { - thresholdVector.setSparseBinIndex(0); - thresholdIterator[0] = 0; - nextBinIndex++; - } - - for (uint32 i = 0; i < numElements; i++) { - float32 currentValue = featureIterator[i].value; - - if (!sparse || currentValue != 0) { - uint32 index = featureIterator[i].index; - auto mapIterator = mapping.emplace(currentValue, nextBinIndex); - - if (mapIterator.second) { - thresholdIterator[nextBinIndex] = currentValue; - binIndices.setBinIndex(index, nextBinIndex); - nextBinIndex++; - } else { - binIndices.setBinIndex(index, mapIterator.first->second); - } - } - } - - thresholdVector.setNumElements(nextBinIndex, true); - } - - return result; - } -}; - -/** - * Allows to create instances of the type `IFeatureBinning` that assign nominal feature values to bins, such that each - * bin contains one of the available values. 
- */ -class NominalFeatureBinningFactory final : public IFeatureBinningFactory { - public: - - std::unique_ptr create() const override { - return std::make_unique(); - } -}; diff --git a/cpp/subprojects/common/src/mlrl/common/binning/threshold_vector.cpp b/cpp/subprojects/common/src/mlrl/common/binning/threshold_vector.cpp deleted file mode 100644 index ac9877547e..0000000000 --- a/cpp/subprojects/common/src/mlrl/common/binning/threshold_vector.cpp +++ /dev/null @@ -1,28 +0,0 @@ -#include "mlrl/common/binning/threshold_vector.hpp" - -ThresholdVector::ThresholdVector(OldMissingFeatureVector& missingFeatureVector, uint32 numElements, bool init) - : ResizableVectorDecorator>>( - ResizableVector(numElements, init)), - OldMissingFeatureVector(missingFeatureVector), sparseBinIndex_(numElements) {} - -uint32 ThresholdVector::getSparseBinIndex() const { - return sparseBinIndex_; -} - -void ThresholdVector::setSparseBinIndex(uint32 sparseBinIndex) { - uint32 numElements = this->getNumElements(); - - if (sparseBinIndex > numElements) { - sparseBinIndex_ = numElements; - } else { - sparseBinIndex_ = sparseBinIndex; - } -} - -void ThresholdVector::setNumElements(uint32 numElements, bool freeMemory) { - ResizableVectorDecorator>>::setNumElements(numElements, freeMemory); - - if (sparseBinIndex_ > numElements) { - sparseBinIndex_ = numElements; - } -} diff --git a/cpp/subprojects/common/src/mlrl/common/indices/index_vector_complete.cpp b/cpp/subprojects/common/src/mlrl/common/indices/index_vector_complete.cpp index f951627fb4..4d4b488f5d 100644 --- a/cpp/subprojects/common/src/mlrl/common/indices/index_vector_complete.cpp +++ b/cpp/subprojects/common/src/mlrl/common/indices/index_vector_complete.cpp @@ -1,6 +1,6 @@ #include "mlrl/common/indices/index_vector_complete.hpp" -#include "mlrl/common/thresholds/thresholds_subset.hpp" +#include "mlrl/common/rule_refinement/feature_subspace.hpp" CompleteIndexVector::CompleteIndexVector(uint32 numElements) { numElements_ = numElements; @@ 
-30,7 +30,7 @@ CompleteIndexVector::const_iterator CompleteIndexVector::cend() const { return IndexIterator(numElements_); } -std::unique_ptr CompleteIndexVector::createRuleRefinement(IThresholdsSubset& thresholdsSubset, +std::unique_ptr CompleteIndexVector::createRuleRefinement(IFeatureSubspace& featureSubspace, uint32 featureIndex) const { - return thresholdsSubset.createRuleRefinement(*this, featureIndex); + return featureSubspace.createRuleRefinement(*this, featureIndex); } diff --git a/cpp/subprojects/common/src/mlrl/common/indices/index_vector_partial.cpp b/cpp/subprojects/common/src/mlrl/common/indices/index_vector_partial.cpp index e6594c90e8..374d16b71c 100644 --- a/cpp/subprojects/common/src/mlrl/common/indices/index_vector_partial.cpp +++ b/cpp/subprojects/common/src/mlrl/common/indices/index_vector_partial.cpp @@ -1,6 +1,6 @@ #include "mlrl/common/indices/index_vector_partial.hpp" -#include "mlrl/common/thresholds/thresholds_subset.hpp" +#include "mlrl/common/rule_refinement/feature_subspace.hpp" PartialIndexVector::PartialIndexVector(uint32 numElements, bool init) : ResizableVectorDecorator>>( @@ -18,7 +18,7 @@ uint32 PartialIndexVector::getIndex(uint32 pos) const { return (*this)[pos]; } -std::unique_ptr PartialIndexVector::createRuleRefinement(IThresholdsSubset& thresholdsSubset, +std::unique_ptr PartialIndexVector::createRuleRefinement(IFeatureSubspace& featureSubspace, uint32 featureIndex) const { - return thresholdsSubset.createRuleRefinement(*this, featureIndex); + return featureSubspace.createRuleRefinement(*this, featureIndex); } diff --git a/cpp/subprojects/common/src/mlrl/common/input/feature_binning_equal_frequency.cpp b/cpp/subprojects/common/src/mlrl/common/input/feature_binning_equal_frequency.cpp new file mode 100644 index 0000000000..f74c9033fe --- /dev/null +++ b/cpp/subprojects/common/src/mlrl/common/input/feature_binning_equal_frequency.cpp @@ -0,0 +1,255 @@ +#include "mlrl/common/input/feature_binning_equal_frequency.hpp" + 
+#include "feature_type_numerical_common.hpp" +#include "feature_vector_decorator_binned.hpp" +#include "mlrl/common/util/math.hpp" +#include "mlrl/common/util/validation.hpp" + +static inline std::unique_ptr createFeatureVectorInternally( + AllocatedMissingFeatureVector&& missingFeatureVector, const NumericalFeatureVector& numericalFeatureVector, + uint32 numExamples, float32 binRatio, uint32 minBins, uint32 maxBins) { + uint32 numBins = calculateBoundedFraction(numExamples, binRatio, minBins, maxBins); + + if (numBins > 1) { + uint32 numElements = numericalFeatureVector.numElements; + AllocatedBinnedFeatureVector binnedFeatureVector(numBins, numElements); + AllocatedBinnedFeatureVector::threshold_iterator thresholdIterator = binnedFeatureVector.thresholds_begin(); + AllocatedBinnedFeatureVector::index_iterator indexIterator = binnedFeatureVector.indices; + AllocatedBinnedFeatureVector::index_iterator indptrIterator = binnedFeatureVector.indptr; + uint32 numElementsPerBin = (uint32) std::ceil((float64) numElements / (float64) numBins); + bool sparse = numericalFeatureVector.sparse; + float32 sparseValue = numericalFeatureVector.sparseValue; + float32 previousValue = sparseValue; + uint32 numElementsInCurrentBin = 0; + uint32 binIndex = 0; + uint32 numIndices = 0; + uint32 i = 0; + + // Iterate feature values `f < sparseValue`... + for (; i < numElements; i++) { + const IndexedValue& entry = numericalFeatureVector[i]; + float32 currentValue = entry.value; + + if (currentValue >= sparseValue) { + break; + } + + // Feature values that are equal to the previous one must not be assigned to a new bin... + if (!isEqual(currentValue, previousValue)) { + // Check, if the bin is fully occupied... 
+ if (numElementsInCurrentBin >= numElementsPerBin) { + thresholdIterator[binIndex] = arithmeticMean(previousValue, currentValue); + indptrIterator[binIndex + 1] = numIndices; + numElementsInCurrentBin = 0; + binIndex++; + } + + previousValue = currentValue; + } + + indexIterator[numIndices] = entry.index; + numElementsInCurrentBin++; + numIndices++; + } + + // If there are any sparse values, check if they belong to the current one or the next one... + if (sparse) { + uint32 numSparseValues = numExamples - numElements; + + if (numElementsInCurrentBin >= numElementsPerBin) { + // The sparse values belong to the next bin... + thresholdIterator[binIndex] = arithmeticMean(previousValue, sparseValue); + indptrIterator[binIndex + 1] = numIndices; + numElementsInCurrentBin = numSparseValues; + binIndex++; + } else { + // The sparse values belong to the current bin... + numIndices -= numElementsInCurrentBin; + numElementsInCurrentBin += numSparseValues; + } + + // If the current bin is not fully occupied yet, the subsequent values do also belong to it... + previousValue = sparseValue; + + // Skip feature values that are equal to the previous one... + for (; i < numElements; i++) { + if (numericalFeatureVector[i].value != previousValue) { + break; + } + + numElementsInCurrentBin++; + } + } + + // Set the index of the sparse bin... + binnedFeatureVector.sparseBinIndex = binIndex; + + // Iterate feature values `f >= sparseValue`... + for (; i < numElements; i++) { + const IndexedValue& entry = numericalFeatureVector[i]; + float32 currentValue = entry.value; + + // Feature values that are equal to the previous one must not be assigned to a new bin... + if (!isEqual(currentValue, previousValue)) { + // Check, if the bin is fully occupied... 
+ if (numElementsInCurrentBin >= numElementsPerBin) { + thresholdIterator[binIndex] = arithmeticMean(previousValue, currentValue); + indptrIterator[binIndex + 1] = numIndices; + numElementsInCurrentBin = 0; + binIndex++; + } + + previousValue = currentValue; + } + + indexIterator[numIndices] = entry.index; + numElementsInCurrentBin++; + numIndices++; + } + + if (binIndex > 0) { + binnedFeatureVector.resize(binIndex + 1, numIndices); + return std::make_unique(std::move(binnedFeatureVector), + std::move(missingFeatureVector)); + } + } + + return std::make_unique(); +} + +/** + * An implementation of the type `IFeatureBinning` that assigns numerical feature values to bins, such that each bin + * contains approximately the same number of values. + */ +class EqualFrequencyFeatureBinning final : public IFeatureBinning { + private: + + const float32 binRatio_; + + const uint32 minBins_; + + const uint32 maxBins_; + + public: + + /** + * @param binRatio A percentage that specifies how many bins should be used, e.g., if 100 values are available, + * 0.5 means that `ceil(0.5 * 100) = 50` bins should be used. Must be in (0, 1) + * @param minBins The minimum number of bins to be used. Must be at least 2 + * @param maxBins The maximum number of bins to be used. Must be at least `minBins` or 0, if the maximum + * number of bins should not be restricted + */ + EqualFrequencyFeatureBinning(float32 binRatio, uint32 minBins, uint32 maxBins) + : binRatio_(binRatio), minBins_(minBins), maxBins_(maxBins) {} + + std::unique_ptr createFeatureVector( + uint32 featureIndex, const FortranContiguousView& featureMatrix) const override { + // Create a numerical feature vector from the given feature matrix... + const std::unique_ptr featureVectorDecoratorPtr = + createNumericalFeatureVector(featureIndex, featureMatrix); + + // Check if all feature values are equal... 
+ const NumericalFeatureVector& numericalFeatureVector = featureVectorDecoratorPtr->getView().firstView; + uint32 numElements = numericalFeatureVector.numElements; + + if (numElements > 0 + && !isEqual(numericalFeatureVector[0].value, numericalFeatureVector[numElements - 1].value)) { + return createFeatureVectorInternally(std::move(featureVectorDecoratorPtr->getView().secondView), + numericalFeatureVector, featureMatrix.numRows, binRatio_, minBins_, + maxBins_); + } + + return std::make_unique(); + } + + std::unique_ptr createFeatureVector( + uint32 featureIndex, const CscView& featureMatrix) const override { + // Create a numerical feature vector from the given feature matrix... + const std::unique_ptr featureVectorDecoratorPtr = + createNumericalFeatureVector(featureIndex, featureMatrix); + + // Check if all feature values are equal... + NumericalFeatureVector& numericalFeatureVector = featureVectorDecoratorPtr->getView().firstView; + uint32 numElements = numericalFeatureVector.numElements; + uint32 numExamples = featureMatrix.numRows; + + if (numElements > 0 + && (numElements < numExamples + || !isEqual(numericalFeatureVector[0].value, numericalFeatureVector[numElements - 1].value))) { + numericalFeatureVector.sparse = numElements < numExamples; + return createFeatureVectorInternally(std::move(featureVectorDecoratorPtr->getView().secondView), + numericalFeatureVector, numExamples, binRatio_, minBins_, + maxBins_); + } + + return std::make_unique(); + } +}; + +/** + * Allows to create instances of the type `IFeatureBinning` that assign numerical feature values to bins, such that each + * bin contains approximately the same number of values. 
+ */ +class EqualFrequencyFeatureBinningFactory final : public IFeatureBinningFactory { + private: + + const float32 binRatio_; + + const uint32 minBins_; + + const uint32 maxBins_; + + public: + + /** + * @param binRatio A percentage that specifies how many bins should be used, e.g., if 100 values are available, + * a percentage of 0.5 means that `ceil(0.5 * 100) = 50` bins should be used. Must be in (0, 1) + * @param minBins The minimum number of bins to be used. Must be at least 2 + * @param maxBins The maximum number of bins to be used. Must be at least `minBins` or 0, if the maximum + * number of bins should not be restricted + */ + EqualFrequencyFeatureBinningFactory(float32 binRatio, uint32 minBins, uint32 maxBins) + : binRatio_(binRatio), minBins_(minBins), maxBins_(maxBins) {} + + std::unique_ptr create() const override { + return std::make_unique(binRatio_, minBins_, maxBins_); + } +}; + +EqualFrequencyFeatureBinningConfig::EqualFrequencyFeatureBinningConfig() : binRatio_(0.33f), minBins_(2), maxBins_(0) {} + +float32 EqualFrequencyFeatureBinningConfig::getBinRatio() const { + return binRatio_; +} + +IEqualFrequencyFeatureBinningConfig& EqualFrequencyFeatureBinningConfig::setBinRatio(float32 binRatio) { + assertGreater("binRatio", binRatio, 0); + assertLess("binRatio", binRatio, 1); + binRatio_ = binRatio; + return *this; +} + +uint32 EqualFrequencyFeatureBinningConfig::getMinBins() const { + return minBins_; +} + +IEqualFrequencyFeatureBinningConfig& EqualFrequencyFeatureBinningConfig::setMinBins(uint32 minBins) { + assertGreaterOrEqual("minBins", minBins, 2); + minBins_ = minBins; + return *this; +} + +uint32 EqualFrequencyFeatureBinningConfig::getMaxBins() const { + return maxBins_; +} + +IEqualFrequencyFeatureBinningConfig& EqualFrequencyFeatureBinningConfig::setMaxBins(uint32 maxBins) { + if (maxBins != 0) assertGreaterOrEqual("maxBins", maxBins, minBins_); + maxBins_ = maxBins; + return *this; +} + +std::unique_ptr 
EqualFrequencyFeatureBinningConfig::createFeatureBinningFactory( + const IFeatureMatrix& featureMatrix, const ILabelMatrix& labelMatrix) const { + return std::make_unique(binRatio_, minBins_, maxBins_); +} diff --git a/cpp/subprojects/common/src/mlrl/common/input/feature_binning_equal_width.cpp b/cpp/subprojects/common/src/mlrl/common/input/feature_binning_equal_width.cpp new file mode 100644 index 0000000000..79f263ab0c --- /dev/null +++ b/cpp/subprojects/common/src/mlrl/common/input/feature_binning_equal_width.cpp @@ -0,0 +1,261 @@ +#include "mlrl/common/input/feature_binning_equal_width.hpp" + +#include "feature_type_numerical_common.hpp" +#include "feature_vector_decorator_binned.hpp" +#include "mlrl/common/data/array.hpp" +#include "mlrl/common/data/tuple.hpp" +#include "mlrl/common/util/math.hpp" +#include "mlrl/common/util/validation.hpp" + +static inline Tuple getMinAndMaxFeatureValue(const NumericalFeatureVector& numericalFeatureVector) { + uint32 numElements = numericalFeatureVector.numElements; + float32 min; + uint32 i; + + if (numericalFeatureVector.sparse) { + min = numericalFeatureVector.sparseValue; + i = 0; + } else { + min = numericalFeatureVector[0].value; + i = 1; + } + + float32 max = min; + + for (; i < numElements; i++) { + float32 currentValue = numericalFeatureVector[i].value; + + if (currentValue < min) { + min = currentValue; + } else if (currentValue > max) { + max = currentValue; + } + } + + return Tuple(min, max); +} + +static inline uint32 getBinIndex(float32 value, float32 min, float32 width, uint32 numBins) { + uint32 binIndex = (uint32) std::floor((value - min) / width); + return binIndex >= numBins ? 
numBins - 1 : binIndex; +} + +static inline std::unique_ptr createFeatureVectorInternally( + AllocatedMissingFeatureVector&& missingFeatureVector, const NumericalFeatureVector& numericalFeatureVector, + uint32 numExamples, float32 binRatio, uint32 minBins, uint32 maxBins) { + uint32 numWidths = calculateBoundedFraction(numExamples, binRatio, minBins, maxBins); + + if (numWidths > 0) { + const Tuple tuple = getMinAndMaxFeatureValue(numericalFeatureVector); + float32 min = tuple.first; + float32 max = tuple.second; + float32 width = (max - min) / numWidths; + uint32 numElements = numericalFeatureVector.numElements; + float32 sparseValue = numericalFeatureVector.sparseValue; + uint32 sparseBinIndex = getBinIndex(sparseValue, min, width, numWidths); + AllocatedBinnedFeatureVector binnedFeatureVector(numWidths, numElements, sparseBinIndex); + AllocatedBinnedFeatureVector::threshold_iterator thresholdIterator = binnedFeatureVector.thresholds_begin(); + AllocatedBinnedFeatureVector::index_iterator indptrIterator = binnedFeatureVector.indptr; + + // Iterate all non-sparse feature values and determine the bins they should be assigned to... + Array numExamplesPerBin(numWidths, true); + + for (uint32 i = 0; i < numElements; i++) { + float32 currentValue = numericalFeatureVector[i].value; + uint32 binIndex = getBinIndex(currentValue, min, width, numWidths); + + if (binIndex != sparseBinIndex) { + numExamplesPerBin[binIndex]++; + } + } + + // Remove empty bins and calculate thresholds... 
+ Array mapping(numWidths); + uint32 numIndices = 0; + uint32 numBins = 0; + + for (uint32 i = 0; i < numWidths; i++) { + uint32 numExamplesInCurrentBin = numExamplesPerBin[i]; + + if (i == sparseBinIndex || numExamplesInCurrentBin > 0) { + thresholdIterator[numBins] = min + ((i + 1) * width); + indptrIterator[numBins] = numIndices; + + if (i != sparseBinIndex) { + numIndices += numExamplesInCurrentBin; + } else { + binnedFeatureVector.sparseBinIndex = numBins; + } + + mapping[i] = numBins; + numBins++; + } else { + mapping[i] = sparseBinIndex; + } + } + + // Set the indices of the examples that have been assigned to each bin... + for (uint32 i = 0; i < numElements; i++) { + const IndexedValue& entry = numericalFeatureVector[i]; + float32 currentValue = entry.value; + uint32 originalBinIndex = getBinIndex(currentValue, min, width, numWidths); + + if (originalBinIndex != sparseBinIndex) { + uint32 binIndex = mapping[originalBinIndex]; + + if (binIndex != binnedFeatureVector.sparseBinIndex) { + uint32 numExamplesInCurrentBin = numExamplesPerBin[originalBinIndex]; + uint32 numRemaining = numExamplesInCurrentBin - 1; + numExamplesPerBin[originalBinIndex] = numRemaining; + BinnedFeatureVector::index_iterator indexIterator = binnedFeatureVector.indices_begin(binIndex); + indexIterator[numRemaining] = entry.index; + } + } + } + + if (numBins > 1) { + binnedFeatureVector.resize(numBins, numIndices); + return std::make_unique(std::move(binnedFeatureVector), + std::move(missingFeatureVector)); + } + } + + return std::make_unique(); +} + +/** + * An implementation of the type `IFeatureBinning` that assigns numerical feature values to bins, such that each bin + * contains values from equally sized value ranges.
+ */ +class EqualWidthFeatureBinning final : public IFeatureBinning { + private: + + const float32 binRatio_; + + const uint32 minBins_; + + const uint32 maxBins_; + + public: + + /** + * @param binRatio A percentage that specifies how many bins should be used, e.g., if 100 values are available, + * 0.5 means that `ceil(0.5 * 100) = 50` bins should be used. Must be in (0, 1) + * @param minBins The minimum number of bins to be used. Must be at least 2 + * @param maxBins The maximum number of bins to be used. Must be at least `minBins` or 0, if the maximum + * number of bins should not be restricted + */ + EqualWidthFeatureBinning(float32 binRatio, uint32 minBins, uint32 maxBins) + : binRatio_(binRatio), minBins_(minBins), maxBins_(maxBins) {} + + std::unique_ptr createFeatureVector( + uint32 featureIndex, const FortranContiguousView& featureMatrix) const override { + // Create a numerical feature vector from the given feature matrix... + const std::unique_ptr featureVectorDecoratorPtr = + createNumericalFeatureVector(featureIndex, featureMatrix); + + // Check if all feature values are equal... + const NumericalFeatureVector& numericalFeatureVector = featureVectorDecoratorPtr->getView().firstView; + uint32 numElements = numericalFeatureVector.numElements; + + if (numElements > 0 + && !isEqual(numericalFeatureVector[0].value, numericalFeatureVector[numElements - 1].value)) { + return createFeatureVectorInternally(std::move(featureVectorDecoratorPtr->getView().secondView), + numericalFeatureVector, featureMatrix.numRows, binRatio_, minBins_, + maxBins_); + } + + return std::make_unique(); + } + + std::unique_ptr createFeatureVector( + uint32 featureIndex, const CscView& featureMatrix) const override { + // Create a numerical feature vector from the given feature matrix... + const std::unique_ptr featureVectorDecoratorPtr = + createNumericalFeatureVector(featureIndex, featureMatrix); + + // Check if all feature values are equal... 
+ NumericalFeatureVector& numericalFeatureVector = featureVectorDecoratorPtr->getView().firstView; + uint32 numElements = numericalFeatureVector.numElements; + uint32 numExamples = featureMatrix.numRows; + + if (numElements > 0 + && (numElements < numExamples + || !isEqual(numericalFeatureVector[0].value, numericalFeatureVector[numElements - 1].value))) { + numericalFeatureVector.sparse = numElements < numExamples; + return createFeatureVectorInternally(std::move(featureVectorDecoratorPtr->getView().secondView), + numericalFeatureVector, numExamples, binRatio_, minBins_, + maxBins_); + } + + return std::make_unique(); + } +}; + +/** + * Allows to create instances of the type `IFeatureBinning` that assign numerical feature values to bins, such that each + * bin contains values from equally sized value ranges. + */ +class EqualWidthFeatureBinningFactory final : public IFeatureBinningFactory { + private: + + const float32 binRatio_; + + const uint32 minBins_; + + const uint32 maxBins_; + + public: + + /** + * @param binRatio A percentage that specifies how many bins should be used, e.g., if 100 values are available, + * 0.5 means that `ceil(0.5 * 100) = 50` bins should be used. Must be in (0, 1) + * @param minBins The minimum number of bins to be used. Must be at least 2 + * @param maxBins The maximum number of bins to be used. 
Must be at least `minBins` or 0, if the maximum + * number of bins should not be restricted + */ + EqualWidthFeatureBinningFactory(float32 binRatio, uint32 minBins, uint32 maxBins) + : binRatio_(binRatio), minBins_(minBins), maxBins_(maxBins) {} + + std::unique_ptr create() const override { + return std::make_unique(binRatio_, minBins_, maxBins_); + } +}; + +EqualWidthFeatureBinningConfig::EqualWidthFeatureBinningConfig() : binRatio_(0.33f), minBins_(2), maxBins_(0) {} + +float32 EqualWidthFeatureBinningConfig::getBinRatio() const { + return binRatio_; +} + +IEqualWidthFeatureBinningConfig& EqualWidthFeatureBinningConfig::setBinRatio(float32 binRatio) { + assertGreater("binRatio", binRatio, 0); + assertLess("binRatio", binRatio, 1); + binRatio_ = binRatio; + return *this; +} + +uint32 EqualWidthFeatureBinningConfig::getMinBins() const { + return minBins_; +} + +IEqualWidthFeatureBinningConfig& EqualWidthFeatureBinningConfig::setMinBins(uint32 minBins) { + assertGreaterOrEqual("minBins", minBins, 2); + minBins_ = minBins; + return *this; +} + +uint32 EqualWidthFeatureBinningConfig::getMaxBins() const { + return maxBins_; +} + +IEqualWidthFeatureBinningConfig& EqualWidthFeatureBinningConfig::setMaxBins(uint32 maxBins) { + if (maxBins != 0) assertGreaterOrEqual("maxBins", maxBins, minBins_); + maxBins_ = maxBins; + return *this; +} + +std::unique_ptr EqualWidthFeatureBinningConfig::createFeatureBinningFactory( + const IFeatureMatrix& featureMatrix, const ILabelMatrix& labelMatrix) const { + return std::make_unique(binRatio_, minBins_, maxBins_); +} diff --git a/cpp/subprojects/common/src/mlrl/common/input/feature_binning_no.cpp b/cpp/subprojects/common/src/mlrl/common/input/feature_binning_no.cpp new file mode 100644 index 0000000000..580bf60a0f --- /dev/null +++ b/cpp/subprojects/common/src/mlrl/common/input/feature_binning_no.cpp @@ -0,0 +1,17 @@ +#include "mlrl/common/input/feature_binning_no.hpp" + +/** + * Allows to create instances of the type `IFeatureBinning` 
that do not actually perform any feature binning. + */ +class NoFeatureBinningFactory final : public IFeatureBinningFactory { + public: + + std::unique_ptr create() const override { + return nullptr; + } +}; + +std::unique_ptr NoFeatureBinningConfig::createFeatureBinningFactory( + const IFeatureMatrix& featureMatrix, const ILabelMatrix& labelMatrix) const { + return std::make_unique(); +} diff --git a/cpp/subprojects/common/src/mlrl/common/input/feature_info_equal.cpp b/cpp/subprojects/common/src/mlrl/common/input/feature_info_equal.cpp index f5bf329218..a62ab69057 100644 --- a/cpp/subprojects/common/src/mlrl/common/input/feature_info_equal.cpp +++ b/cpp/subprojects/common/src/mlrl/common/input/feature_info_equal.cpp @@ -5,27 +5,58 @@ #include "mlrl/common/input/feature_type_ordinal.hpp" /** - * An implementation of the type `IEqualFeatureInfo` that stores the type of all features. - * - * @tparam FeatureType The type of all features + * An implementation of the type `IEqualFeatureInfo` that creates an object of type `OrdinalFeatureType` for each + * feature. */ -template -class EqualFeatureInfo final : public IEqualFeatureInfo { +class OrdinalFeatureInfo final : public IEqualFeatureInfo { public: - std::unique_ptr createFeatureType(uint32 featureIndex) const override { - return std::make_unique(); + std::unique_ptr createFeatureType( + uint32 featureIndex, const IFeatureBinningFactory& featureBinningFactory) const override { + return std::make_unique(); + } +}; + +/** + * An implementation of the type `IEqualFeatureInfo` that creates an object of type `NominalFeatureType` for each + * feature. 
+ */ +class NominalFeatureInfo final : public IEqualFeatureInfo { + public: + + std::unique_ptr createFeatureType( + uint32 featureIndex, const IFeatureBinningFactory& featureBinningFactory) const override { + return std::make_unique(); + } +}; + +/** + * An implementation of the type `IEqualFeatureInfo` that creates an object of type `IFeatureBinning` or + * `NumericalFeatureType` for each feature, depending on whether feature binning should be used or not. + */ +class NumericalFeatureInfo final : public IEqualFeatureInfo { + public: + + std::unique_ptr createFeatureType( + uint32 featureIndex, const IFeatureBinningFactory& featureBinningFactory) const override { + std::unique_ptr featureBinningPtr = featureBinningFactory.create(); + + if (featureBinningPtr) { + return featureBinningPtr; + } + + return std::make_unique(); } }; std::unique_ptr createOrdinalFeatureInfo() { - return std::make_unique>(); + return std::make_unique(); } std::unique_ptr createNominalFeatureInfo() { - return std::make_unique>(); + return std::make_unique(); } std::unique_ptr createNumericalFeatureInfo() { - return std::make_unique>(); + return std::make_unique(); } diff --git a/cpp/subprojects/common/src/mlrl/common/input/feature_info_mixed.cpp b/cpp/subprojects/common/src/mlrl/common/input/feature_info_mixed.cpp index cd9eaf3b6d..d8246326ac 100644 --- a/cpp/subprojects/common/src/mlrl/common/input/feature_info_mixed.cpp +++ b/cpp/subprojects/common/src/mlrl/common/input/feature_info_mixed.cpp @@ -24,12 +24,19 @@ class BitFeatureInfo final : public IMixedFeatureInfo { BitFeatureInfo(uint32 numFeatures) : ordinalBitVector_(numFeatures, true), nominalBitVector_(numFeatures, true) {} - std::unique_ptr createFeatureType(uint32 featureIndex) const override { + std::unique_ptr createFeatureType( + uint32 featureIndex, const IFeatureBinningFactory& featureBinningFactory) const override { if (ordinalBitVector_[featureIndex]) { return std::make_unique(); } else if (nominalBitVector_[featureIndex]) 
{ return std::make_unique(); } else { + std::unique_ptr featureBinningPtr = featureBinningFactory.create(); + + if (featureBinningPtr) { + return featureBinningPtr; + } + return std::make_unique(); } } diff --git a/cpp/subprojects/common/src/mlrl/common/input/feature_matrix_csc.cpp b/cpp/subprojects/common/src/mlrl/common/input/feature_matrix_csc.cpp index e450eb6bd9..68470d876b 100644 --- a/cpp/subprojects/common/src/mlrl/common/input/feature_matrix_csc.cpp +++ b/cpp/subprojects/common/src/mlrl/common/input/feature_matrix_csc.cpp @@ -42,31 +42,6 @@ class CscFeatureMatrix final : public IterableSparseMatrixDecoratorgetNumCols(); } - void fetchFeatureVector(uint32 featureIndex, std::unique_ptr& featureVectorPtr) const override { - index_const_iterator indexIterator = this->indices_cbegin(featureIndex); - index_const_iterator indicesEnd = this->indices_cend(featureIndex); - value_const_iterator valueIterator = this->values_cbegin(featureIndex); - uint32 numElements = indicesEnd - indexIterator; - featureVectorPtr = std::make_unique(numElements); - FeatureVector::iterator vectorIterator = featureVectorPtr->begin(); - uint32 i = 0; - - for (uint32 j = 0; j < numElements; j++) { - uint32 index = indexIterator[j]; - float32 value = valueIterator[j]; - - if (std::isnan(value)) { - featureVectorPtr->addMissingIndex(index); - } else { - vectorIterator[i].index = index; - vectorIterator[i].value = value; - i++; - } - } - - featureVectorPtr->setNumElements(i, true); - } - std::unique_ptr createFeatureVector(uint32 featureIndex, const IFeatureType& featureType) const override { return featureType.createFeatureVector(featureIndex, this->getView()); diff --git a/cpp/subprojects/common/src/mlrl/common/input/feature_matrix_fortran_contiguous.cpp b/cpp/subprojects/common/src/mlrl/common/input/feature_matrix_fortran_contiguous.cpp index 42d935cd79..b881182de7 100644 --- a/cpp/subprojects/common/src/mlrl/common/input/feature_matrix_fortran_contiguous.cpp +++ 
b/cpp/subprojects/common/src/mlrl/common/input/feature_matrix_fortran_contiguous.cpp @@ -37,28 +37,6 @@ class FortranContiguousFeatureMatrix final : public DenseMatrixDecoratorgetNumCols(); } - void fetchFeatureVector(uint32 featureIndex, std::unique_ptr& featureVectorPtr) const override { - value_const_iterator columnIterator = this->values_cbegin(featureIndex); - uint32 numElements = this->getNumRows(); - featureVectorPtr = std::make_unique(numElements); - FeatureVector::iterator vectorIterator = featureVectorPtr->begin(); - uint32 i = 0; - - for (uint32 j = 0; j < numElements; j++) { - float32 value = columnIterator[j]; - - if (std::isnan(value)) { - featureVectorPtr->addMissingIndex(j); - } else { - vectorIterator[i].index = j; - vectorIterator[i].value = value; - i++; - } - } - - featureVectorPtr->setNumElements(i, true); - } - std::unique_ptr createFeatureVector(uint32 featureIndex, const IFeatureType& featureType) const override { return featureType.createFeatureVector(featureIndex, this->getView()); diff --git a/cpp/subprojects/common/src/mlrl/common/input/feature_type_nominal.cpp b/cpp/subprojects/common/src/mlrl/common/input/feature_type_nominal.cpp index 8dc7974a12..1b39cdc7cc 100644 --- a/cpp/subprojects/common/src/mlrl/common/input/feature_type_nominal.cpp +++ b/cpp/subprojects/common/src/mlrl/common/input/feature_type_nominal.cpp @@ -117,14 +117,6 @@ static inline std::unique_ptr createFeatureVectorInternally( sparse); } -bool NominalFeatureType::isOrdinal() const { - return false; -} - -bool NominalFeatureType::isNominal() const { - return true; -} - std::unique_ptr NominalFeatureType::createFeatureVector( uint32 featureIndex, const FortranContiguousView& featureMatrix) const { return createFeatureVectorInternally(featureIndex, featureMatrix); diff --git a/cpp/subprojects/common/src/mlrl/common/input/feature_type_numerical.cpp b/cpp/subprojects/common/src/mlrl/common/input/feature_type_numerical.cpp index c8d796dc4f..de6e1579dc 100644 --- 
a/cpp/subprojects/common/src/mlrl/common/input/feature_type_numerical.cpp +++ b/cpp/subprojects/common/src/mlrl/common/input/feature_type_numerical.cpp @@ -1,56 +1,11 @@ #include "mlrl/common/input/feature_type_numerical.hpp" -#include "feature_vector_decorator_numerical.hpp" +#include "feature_type_numerical_common.hpp" #include "mlrl/common/input/feature_vector_equal.hpp" #include "mlrl/common/iterator/index_iterator.hpp" #include -template -static inline std::unique_ptr createNumericalFeatureVector(IndexIterator indexIterator, - ValueIterator valueIterator, - uint32 numElements) { - AllocatedNumericalFeatureVector numericalFeatureVector(numElements); - AllocatedMissingFeatureVector missingFeatureVector; - uint32 n = 0; - - for (uint32 i = 0; i < numElements; i++) { - uint32 index = indexIterator[i]; - float32 value = valueIterator[i]; - - if (std::isnan(value)) { - missingFeatureVector.set(index, true); - } else { - IndexedValue& entry = numericalFeatureVector[n]; - entry.index = index; - entry.value = value; - n++; - } - } - - numericalFeatureVector.resize(n, true); - std::sort(numericalFeatureVector.begin(), numericalFeatureVector.end(), IndexedValue::CompareValue()); - return std::make_unique(std::move(numericalFeatureVector), - std::move(missingFeatureVector)); -} - -static inline std::unique_ptr createNumericalFeatureVector( - uint32 featureIndex, const FortranContiguousView& featureMatrix) { - FortranContiguousView::value_const_iterator valueIterator = - featureMatrix.values_cbegin(featureIndex); - uint32 numRows = featureMatrix.numRows; - return createNumericalFeatureVector(IndexIterator(), valueIterator, numRows); -} - -static inline std::unique_ptr createNumericalFeatureVector( - uint32 featureIndex, const CscView& featureMatrix) { - CscView::index_const_iterator indexIterator = featureMatrix.indices_cbegin(featureIndex); - CscView::index_const_iterator indicesEnd = featureMatrix.indices_cend(featureIndex); - CscView::value_const_iterator valueIterator 
= featureMatrix.values_cbegin(featureIndex); - uint32 numIndices = indicesEnd - indexIterator; - return createNumericalFeatureVector(indexIterator, valueIterator, numIndices); -} - static inline std::unique_ptr createFeatureVectorInternally( uint32 featureIndex, const FortranContiguousView& featureMatrix) { std::unique_ptr featureVectorDecoratorPtr = @@ -87,14 +42,6 @@ static inline std::unique_ptr createFeatureVectorInternally( return std::make_unique(); } -bool NumericalFeatureType::isOrdinal() const { - return false; -} - -bool NumericalFeatureType::isNominal() const { - return false; -} - std::unique_ptr NumericalFeatureType::createFeatureVector( uint32 featureIndex, const FortranContiguousView& featureMatrix) const { return createFeatureVectorInternally(featureIndex, featureMatrix); diff --git a/cpp/subprojects/common/src/mlrl/common/input/feature_type_numerical_common.hpp b/cpp/subprojects/common/src/mlrl/common/input/feature_type_numerical_common.hpp new file mode 100644 index 0000000000..ef70ab5bca --- /dev/null +++ b/cpp/subprojects/common/src/mlrl/common/input/feature_type_numerical_common.hpp @@ -0,0 +1,56 @@ +/* + * @author Michael Rapp (michael.rapp.ml@gmail.com) + */ +#pragma once + +#include "feature_vector_decorator_numerical.hpp" +#include "mlrl/common/data/view_matrix_csc.hpp" +#include "mlrl/common/data/view_matrix_fortran_contiguous.hpp" +#include "mlrl/common/iterator/index_iterator.hpp" + +#include + +template +static inline std::unique_ptr createNumericalFeatureVector(IndexIterator indexIterator, + ValueIterator valueIterator, + uint32 numElements) { + AllocatedNumericalFeatureVector numericalFeatureVector(numElements); + AllocatedMissingFeatureVector missingFeatureVector; + uint32 n = 0; + + for (uint32 i = 0; i < numElements; i++) { + uint32 index = indexIterator[i]; + float32 value = valueIterator[i]; + + if (std::isnan(value)) { + missingFeatureVector.set(index, true); + } else { + IndexedValue& entry = numericalFeatureVector[n]; + 
entry.index = index; + entry.value = value; + n++; + } + } + + numericalFeatureVector.resize(n, true); + std::sort(numericalFeatureVector.begin(), numericalFeatureVector.end(), IndexedValue::CompareValue()); + return std::make_unique(std::move(numericalFeatureVector), + std::move(missingFeatureVector)); +} + +static inline std::unique_ptr createNumericalFeatureVector( + uint32 featureIndex, const FortranContiguousView& featureMatrix) { + FortranContiguousView::value_const_iterator valueIterator = + featureMatrix.values_cbegin(featureIndex); + uint32 numRows = featureMatrix.numRows; + return createNumericalFeatureVector(IndexIterator(), valueIterator, numRows); +} + +static inline std::unique_ptr createNumericalFeatureVector( + uint32 featureIndex, const CscView& featureMatrix) { + CscView::index_const_iterator indexIterator = featureMatrix.indices_cbegin(featureIndex); + CscView::index_const_iterator indicesEnd = featureMatrix.indices_cend(featureIndex); + CscView::value_const_iterator valueIterator = featureMatrix.values_cbegin(featureIndex); + uint32 numIndices = indicesEnd - indexIterator; + return createNumericalFeatureVector(indexIterator, valueIterator, numIndices); +} diff --git a/cpp/subprojects/common/src/mlrl/common/input/feature_type_ordinal.cpp b/cpp/subprojects/common/src/mlrl/common/input/feature_type_ordinal.cpp index 78853bec8c..7273405f87 100644 --- a/cpp/subprojects/common/src/mlrl/common/input/feature_type_ordinal.cpp +++ b/cpp/subprojects/common/src/mlrl/common/input/feature_type_ordinal.cpp @@ -130,14 +130,6 @@ static inline std::unique_ptr createFeatureVectorInternally( sparse); } -bool OrdinalFeatureType::isOrdinal() const { - return true; -} - -bool OrdinalFeatureType::isNominal() const { - return false; -} - std::unique_ptr OrdinalFeatureType::createFeatureVector( uint32 featureIndex, const FortranContiguousView& featureMatrix) const { return createFeatureVectorInternally(featureIndex, featureMatrix); diff --git 
a/cpp/subprojects/common/src/mlrl/common/input/feature_vector.cpp b/cpp/subprojects/common/src/mlrl/common/input/feature_vector.cpp deleted file mode 100644 index 6d28ccacff..0000000000 --- a/cpp/subprojects/common/src/mlrl/common/input/feature_vector.cpp +++ /dev/null @@ -1,11 +0,0 @@ -#include "mlrl/common/input/feature_vector.hpp" - -#include - -FeatureVector::FeatureVector(uint32 numElements) - : ResizableVectorDecorator>>>( - ResizableVector>(numElements)) {} - -void FeatureVector::sortByValues() { - std::sort(this->begin(), this->end(), IndexedValue::CompareValue()); -} diff --git a/cpp/subprojects/common/src/mlrl/common/input/feature_vector_binned.cpp b/cpp/subprojects/common/src/mlrl/common/input/feature_vector_binned.cpp new file mode 100644 index 0000000000..1ea6dddebd --- /dev/null +++ b/cpp/subprojects/common/src/mlrl/common/input/feature_vector_binned.cpp @@ -0,0 +1,63 @@ +#include "mlrl/common/input/feature_vector_binned.hpp" + +BinnedFeatureVector::BinnedFeatureVector(float32* thresholds, uint32* indices, uint32* indptr, uint32 numBins, + uint32 numIndices, uint32 sparseBinIndex) + : thresholds(thresholds), indices(indices), indptr(indptr), numBins(numBins), sparseBinIndex(sparseBinIndex) {} + +BinnedFeatureVector::BinnedFeatureVector(const BinnedFeatureVector& other) + : thresholds(other.thresholds), indices(other.indices), indptr(other.indptr), numBins(other.numBins), + sparseBinIndex(other.sparseBinIndex) {} + +BinnedFeatureVector::BinnedFeatureVector(BinnedFeatureVector&& other) + : thresholds(other.thresholds), indices(other.indices), indptr(other.indptr), numBins(other.numBins), + sparseBinIndex(other.sparseBinIndex) {} + +BinnedFeatureVector::threshold_const_iterator BinnedFeatureVector::thresholds_cbegin() const { + return thresholds; +} + +BinnedFeatureVector::threshold_const_iterator BinnedFeatureVector::thresholds_cend() const { + return &thresholds[numBins - 1]; +} + +BinnedFeatureVector::threshold_iterator 
BinnedFeatureVector::thresholds_begin() { + return thresholds; +} + +BinnedFeatureVector::threshold_iterator BinnedFeatureVector::thresholds_end() { + return &thresholds[numBins - 1]; +} + +BinnedFeatureVector::index_const_iterator BinnedFeatureVector::indices_cbegin(uint32 index) const { + return &indices[indptr[index]]; +} + +BinnedFeatureVector::index_const_iterator BinnedFeatureVector::indices_cend(uint32 index) const { + return &indices[indptr[index + 1]]; +} + +BinnedFeatureVector::index_iterator BinnedFeatureVector::indices_begin(uint32 index) { + return &indices[indptr[index]]; +} + +BinnedFeatureVector::index_iterator BinnedFeatureVector::indices_end(uint32 index) { + return &indices[indptr[index + 1]]; +} + +BinnedFeatureVector::threshold_type* BinnedFeatureVector::releaseThresholds() { + threshold_type* ptr = thresholds; + thresholds = nullptr; + return ptr; +} + +BinnedFeatureVector::index_type* BinnedFeatureVector::releaseIndices() { + index_type* ptr = indices; + indices = nullptr; + return ptr; +} + +BinnedFeatureVector::index_type* BinnedFeatureVector::releaseIndptr() { + index_type* ptr = indptr; + indptr = nullptr; + return ptr; +} diff --git a/cpp/subprojects/common/src/mlrl/common/input/feature_vector_binned_allocated.hpp b/cpp/subprojects/common/src/mlrl/common/input/feature_vector_binned_allocated.hpp new file mode 100644 index 0000000000..2523308cfc --- /dev/null +++ b/cpp/subprojects/common/src/mlrl/common/input/feature_vector_binned_allocated.hpp @@ -0,0 +1,65 @@ +/* + * @author Michael Rapp (michael.rapp.ml@gmail.com) + */ +#pragma once + +#include "mlrl/common/input/feature_vector_binned.hpp" + +/** + * Allocates the memory, a `BinnedFeatureVector` provides access to. 
+ */ +class MLRLCOMMON_API AllocatedBinnedFeatureVector : public BinnedFeatureVector { + public: + + /** + * @param numBins The number of bins, including the most frequent one + * @param numIndices The number of examples not associated with the most frequent bin + * @param sparseBinIndex The index of the most frequent bin + */ + AllocatedBinnedFeatureVector(uint32 numBins, uint32 numIndices, uint32 sparseBinIndex = 0) + : BinnedFeatureVector(allocateMemory(numBins - 1), allocateMemory(numIndices), + allocateMemory(numBins + 1), numBins, numIndices, sparseBinIndex) { + BinnedFeatureVector::indptr[0] = 0; + BinnedFeatureVector::indptr[numBins] = numIndices; + } + + /** + * @param other A reference to an object of type `AllocatedBinnedFeatureVector` that should be copied + */ + AllocatedBinnedFeatureVector(const AllocatedBinnedFeatureVector& other) : BinnedFeatureVector(other) { + throw std::runtime_error("Objects of type AllocatedBinnedFeatureVector cannot be copied"); + } + + /** + * @param other A reference to an object of type `AllocatedBinnedFeatureVector` that should be moved + */ + AllocatedBinnedFeatureVector(AllocatedBinnedFeatureVector&& other) : BinnedFeatureVector(std::move(other)) { + other.releaseThresholds(); + other.releaseIndices(); + other.releaseIndptr(); + } + + virtual ~AllocatedBinnedFeatureVector() override { + freeMemory(BinnedFeatureVector::thresholds); + freeMemory(BinnedFeatureVector::indices); + freeMemory(BinnedFeatureVector::indptr); + } + + /** + * Resizes the view by re-allocating the memory it provides access to. 
+ * + * @param numBins The number of bins, including the most frequent one + * @param numIndices The number of examples not associated with the most frequent bin + */ + void resize(uint32 numBins, uint32 numIndices) { + BinnedFeatureVector::thresholds = reallocateMemory(BinnedFeatureVector::thresholds, numBins - 1); + BinnedFeatureVector::indices = reallocateMemory(BinnedFeatureVector::indices, numIndices); + BinnedFeatureVector::indptr = reallocateMemory(BinnedFeatureVector::indptr, numBins + 1); + BinnedFeatureVector::numBins = numBins; + BinnedFeatureVector::indptr[numBins] = numIndices; + + if (BinnedFeatureVector::sparseBinIndex >= numBins) { + BinnedFeatureVector::sparseBinIndex = numBins - 1; + } + } +}; diff --git a/cpp/subprojects/common/src/mlrl/common/input/feature_vector_decorator.hpp b/cpp/subprojects/common/src/mlrl/common/input/feature_vector_decorator.hpp index bc465f4e33..ada1e3ee76 100644 --- a/cpp/subprojects/common/src/mlrl/common/input/feature_vector_decorator.hpp +++ b/cpp/subprojects/common/src/mlrl/common/input/feature_vector_decorator.hpp @@ -41,7 +41,7 @@ static inline std::unique_ptr createFilteredFeatureVectorDecorator(co uint32 index = *it; it++; // Iterator must be incremented before call to `MissingFeatureVector::set` invalidates it - if (!coverageMask.isCovered(index)) { + if (!coverageMask[index]) { missingFeatureVector.set(index, false); } } @@ -56,7 +56,7 @@ static inline std::unique_ptr createFilteredFeatureVectorDecorator(co it++) { uint32 index = *it; - if (coverageMask.isCovered(index)) { + if (coverageMask[index]) { missingFeatureVector.set(index, true); } } diff --git a/cpp/subprojects/common/src/mlrl/common/input/feature_vector_decorator_binary.hpp b/cpp/subprojects/common/src/mlrl/common/input/feature_vector_decorator_binary.hpp index e2837b8637..491033c9e4 100644 --- a/cpp/subprojects/common/src/mlrl/common/input/feature_vector_decorator_binary.hpp +++ 
b/cpp/subprojects/common/src/mlrl/common/input/feature_vector_decorator_binary.hpp @@ -3,13 +3,14 @@ */ #pragma once +#include "feature_vector_decorator_binned_common.hpp" #include "feature_vector_decorator_nominal_common.hpp" /** * Provides random read and write access, as well as read and write access via iterators, to the values and indicies of * training examples stored in an `BinaryFeatureVector`. */ -class BinaryFeatureVectorDecorator final : public AbstractNominalFeatureVectorDecorator { +class BinaryFeatureVectorDecorator final : public AbstractBinnedFeatureVectorDecorator { public: /** @@ -18,30 +19,29 @@ class BinaryFeatureVectorDecorator final : public AbstractNominalFeatureVectorDe */ BinaryFeatureVectorDecorator(AllocatedNominalFeatureVector&& firstView, AllocatedMissingFeatureVector&& secondView) - : AbstractNominalFeatureVectorDecorator(std::move(firstView), std::move(secondView)) {} + : AbstractBinnedFeatureVectorDecorator(std::move(firstView), + std::move(secondView)) {} /** * @param other A reference to an object of type `BinaryFeatureVectorDecorator` that should be copied */ BinaryFeatureVectorDecorator(const BinaryFeatureVectorDecorator& other) - : AbstractNominalFeatureVectorDecorator(other) {} - - void searchForRefinement(RuleRefinementSearch& ruleRefinementSearch, - IWeightedStatisticsSubset& statisticsSubset, SingleRefinementComparator& comparator, - uint32 numExamplesWithNonZeroWeights, uint32 minCoverage, - Refinement& refinement) const override { - ruleRefinementSearch.searchForBinaryRefinement(this->view.firstView, this->view.secondView, - statisticsSubset, comparator, numExamplesWithNonZeroWeights, - minCoverage, refinement); + : AbstractBinnedFeatureVectorDecorator(other) {} + + void searchForRefinement(FeatureBasedSearch& featureBasedSearch, IWeightedStatisticsSubset& statisticsSubset, + SingleRefinementComparator& comparator, uint32 numExamplesWithNonZeroWeights, + uint32 minCoverage, Refinement& refinement) const override { + 
featureBasedSearch.searchForBinaryRefinement(this->view.firstView, this->view.secondView, statisticsSubset, + comparator, numExamplesWithNonZeroWeights, minCoverage, + refinement); } - void searchForRefinement(RuleRefinementSearch& ruleRefinementSearch, - IWeightedStatisticsSubset& statisticsSubset, FixedRefinementComparator& comparator, - uint32 numExamplesWithNonZeroWeights, uint32 minCoverage, - Refinement& refinement) const override { - ruleRefinementSearch.searchForBinaryRefinement(this->view.firstView, this->view.secondView, - statisticsSubset, comparator, numExamplesWithNonZeroWeights, - minCoverage, refinement); + void searchForRefinement(FeatureBasedSearch& featureBasedSearch, IWeightedStatisticsSubset& statisticsSubset, + FixedRefinementComparator& comparator, uint32 numExamplesWithNonZeroWeights, + uint32 minCoverage, Refinement& refinement) const override { + featureBasedSearch.searchForBinaryRefinement(this->view.firstView, this->view.secondView, statisticsSubset, + comparator, numExamplesWithNonZeroWeights, minCoverage, + refinement); } std::unique_ptr createFilteredFeatureVector(std::unique_ptr& existing, diff --git a/cpp/subprojects/common/src/mlrl/common/input/feature_vector_decorator_binned.hpp b/cpp/subprojects/common/src/mlrl/common/input/feature_vector_decorator_binned.hpp new file mode 100644 index 0000000000..8f6075b51d --- /dev/null +++ b/cpp/subprojects/common/src/mlrl/common/input/feature_vector_decorator_binned.hpp @@ -0,0 +1,304 @@ +/* + * @author Michael Rapp (michael.rapp.ml@gmail.com) + */ +#pragma once + +#include "feature_vector_binned_allocated.hpp" +#include "feature_vector_decorator_binned_common.hpp" + +#include + +template +static inline std::optional createFilteredBinnedFeatureVectorView( + const Decorator& decorator, std::unique_ptr& existing, const Interval& interval) { + const BinnedFeatureVector& featureVector = decorator.getView().firstView; + Tuple tuple = getStartAndEndOfOpenInterval(interval, featureVector.numBins); + 
uint32 start = tuple.first; + uint32 end = tuple.second; + uint32 numFilteredBins = end - start; + + if (numFilteredBins > 0) { + uint32 sparseBinIndex = featureVector.sparseBinIndex; + sparseBinIndex = sparseBinIndex >= start ? sparseBinIndex - start : 0; + sparseBinIndex = sparseBinIndex >= numFilteredBins ? numFilteredBins - 1 : sparseBinIndex; + return BinnedFeatureVector(&featureVector.thresholds[start], featureVector.indices, + &featureVector.indptr[start], numFilteredBins, + featureVector.indptr[featureVector.numBins], sparseBinIndex); + } + + return {}; +} + +template +static inline std::unique_ptr createFilteredBinnedFeatureVectorDecorator( + const View& view, std::unique_ptr& existing, const CoverageMask& coverageMask) { + std::unique_ptr filteredDecoratorPtr = + createFilteredFeatureVectorDecorator(view, existing, coverageMask); + + // Filter the indices of examples not associated with the majority value... + const BinnedFeatureVector& featureVector = view.getView().firstView; + AllocatedBinnedFeatureVector& filteredFeatureVector = filteredDecoratorPtr->getView().firstView; + AllocatedBinnedFeatureVector::index_iterator filteredIndexIterator = filteredFeatureVector.indices; + AllocatedBinnedFeatureVector::index_iterator filteredIndptrIterator = filteredFeatureVector.indptr; + AllocatedBinnedFeatureVector::threshold_iterator filteredThresholdIterator = filteredFeatureVector.thresholds; + uint32 numFilteredBins = 0; + uint32 numFilteredIndices = 0; + + for (uint32 i = 0; i < featureVector.numBins; i++) { + BinnedFeatureVector::index_const_iterator indexIterator = featureVector.indices_cbegin(i); + BinnedFeatureVector::index_const_iterator indicesEnd = featureVector.indices_cend(i); + uint32 numIndices = indicesEnd - indexIterator; + uint32 indptr = numFilteredIndices; + + for (uint32 j = 0; j < numIndices; j++) { + uint32 index = indexIterator[j]; + + if (coverageMask[index]) { + filteredIndexIterator[numFilteredIndices] = index; + numFilteredIndices++; + 
} + } + + if (numFilteredIndices > indptr) { + if (numFilteredBins >= filteredFeatureVector.sparseBinIndex) { + filteredFeatureVector.sparseBinIndex = numFilteredBins; + } + + filteredIndptrIterator[numFilteredBins] = indptr; + + if (i < featureVector.numBins - 1) { + filteredThresholdIterator[numFilteredBins] = featureVector.thresholds[i]; + } + + numFilteredBins++; + } + } + + if (numFilteredIndices > 0) { + filteredFeatureVector.resize(numFilteredBins, numFilteredIndices); + return filteredDecoratorPtr; + } + + return std::make_unique(); +} + +// Forward declarations +class BinnedFeatureVectorDecorator; + +/** + * Provides random read and write access, as well as read and write access via iterators, to the values and thresholds + * stored in a `BinnedFeatureVector`. + */ +class BinnedFeatureVectorView final : public AbstractFeatureVectorDecorator { + public: + + /** + * @param firstView A reference to an object of type `BinnedFeatureVector` + */ + BinnedFeatureVectorView(BinnedFeatureVector&& firstView) + : AbstractFeatureVectorDecorator(std::move(firstView), AllocatedMissingFeatureVector()) {} + + void searchForRefinement(FeatureBasedSearch& featureBasedSearch, IWeightedStatisticsSubset& statisticsSubset, + SingleRefinementComparator& comparator, uint32 numExamplesWithNonZeroWeights, + uint32 minCoverage, Refinement& refinement) const override { + featureBasedSearch.searchForBinnedRefinement(this->view.firstView, this->view.secondView, statisticsSubset, + comparator, numExamplesWithNonZeroWeights, minCoverage, + refinement); + } + + void searchForRefinement(FeatureBasedSearch& featureBasedSearch, IWeightedStatisticsSubset& statisticsSubset, + FixedRefinementComparator& comparator, uint32 numExamplesWithNonZeroWeights, + uint32 minCoverage, Refinement& refinement) const override { + featureBasedSearch.searchForBinnedRefinement(this->view.firstView, this->view.secondView, statisticsSubset, + comparator, numExamplesWithNonZeroWeights, minCoverage, + refinement); + 
} + + void updateCoverageMaskAndStatistics(const Interval& interval, CoverageMask& coverageMask, + uint32 indicatorValue, + IWeightedStatistics& statistics) const override final { + updateCoverageMaskAndStatisticsBasedOnBinnedFeatureVector( + *this, interval, coverageMask, indicatorValue, statistics); + } + + std::unique_ptr createFilteredFeatureVector(std::unique_ptr& existing, + const Interval& interval) const override { + std::optional filteredFeatureVector = + createFilteredBinnedFeatureVectorView(*this, existing, interval); + + if (filteredFeatureVector) { + return std::make_unique(std::move(*filteredFeatureVector)); + } + + return std::make_unique(); + } + + std::unique_ptr createFilteredFeatureVector(std::unique_ptr& existing, + const CoverageMask& coverageMask) const override { + return createFilteredBinnedFeatureVectorDecorator( + *this, existing, coverageMask); + } +}; + +/** + * Provides random read and write access, as well as read and write access via iterators, to a subset of the indices and + * thresholds stored in an `AllocatedBinnedFeatureVector`. + */ +class AllocatedBinnedFeatureVectorView final : public AbstractFeatureVectorDecorator { + public: + + /** + * The `AllocatedBinnedFeatureVector`, the view provides access to. 
+ */ + AllocatedBinnedFeatureVector allocatedView; + + /** + * @param allocatedView A reference to an object of type `AllocatedBinnedFeatureVector` + * @param firstView A reference to an object of type `BinnedFeatureVector` + */ + AllocatedBinnedFeatureVectorView(AllocatedBinnedFeatureVector&& allocatedView, BinnedFeatureVector&& firstView) + : AbstractFeatureVectorDecorator(std::move(firstView), + AllocatedMissingFeatureVector()), + allocatedView(std::move(allocatedView)) {} + + void searchForRefinement(FeatureBasedSearch& featureBasedSearch, IWeightedStatisticsSubset& statisticsSubset, + SingleRefinementComparator& comparator, uint32 numExamplesWithNonZeroWeights, + uint32 minCoverage, Refinement& refinement) const override { + featureBasedSearch.searchForBinnedRefinement(this->view.firstView, this->view.secondView, statisticsSubset, + comparator, numExamplesWithNonZeroWeights, minCoverage, + refinement); + } + + void searchForRefinement(FeatureBasedSearch& featureBasedSearch, IWeightedStatisticsSubset& statisticsSubset, + FixedRefinementComparator& comparator, uint32 numExamplesWithNonZeroWeights, + uint32 minCoverage, Refinement& refinement) const override { + featureBasedSearch.searchForBinnedRefinement(this->view.firstView, this->view.secondView, statisticsSubset, + comparator, numExamplesWithNonZeroWeights, minCoverage, + refinement); + } + + void updateCoverageMaskAndStatistics(const Interval& interval, CoverageMask& coverageMask, + uint32 indicatorValue, + IWeightedStatistics& statistics) const override final { + updateCoverageMaskAndStatisticsBasedOnBinnedFeatureVector( + *this, interval, coverageMask, indicatorValue, statistics); + } + + std::unique_ptr createFilteredFeatureVector(std::unique_ptr& existing, + const Interval& interval) const override { + std::optional filteredFeatureVector = + createFilteredBinnedFeatureVectorView(*this, existing, interval); + + if (filteredFeatureVector) { + AllocatedBinnedFeatureVectorView* existingView = + 
dynamic_cast(existing.get()); + + if (existingView) { + return std::make_unique(std::move(existingView->allocatedView), + std::move(*filteredFeatureVector)); + } + + return std::make_unique(std::move(*filteredFeatureVector)); + } + + return std::make_unique(); + } + + std::unique_ptr createFilteredFeatureVector(std::unique_ptr& existing, + const CoverageMask& coverageMask) const override { + return createFilteredBinnedFeatureVectorDecorator(*this, existing, + coverageMask); + } +}; + +/** + * Provides random read and write access, as well as read and write access via iterators, to the values and thresholds + * stored in an `AllocatedBinnedFeatureVector`. + */ +class BinnedFeatureVectorDecorator final : public AbstractBinnedFeatureVectorDecorator { + public: + + /** + * @param firstView A reference to an object of type `AllocatedBinnedFeatureVector` + * @param secondView A reference to an object of type `AllocatedMissingFeatureVector` + */ + BinnedFeatureVectorDecorator(AllocatedBinnedFeatureVector&& firstView, + AllocatedMissingFeatureVector&& secondView) + : AbstractBinnedFeatureVectorDecorator(std::move(firstView), + std::move(secondView)) {} + + /** + * @param other A reference to an object of type `BinnedFeatureVectorDecorator` that should be copied + */ + BinnedFeatureVectorDecorator(const BinnedFeatureVectorDecorator& other) + : BinnedFeatureVectorDecorator( + AllocatedBinnedFeatureVector(other.view.firstView.numBins, + other.view.firstView.indptr[other.view.firstView.numBins], + other.view.firstView.sparseBinIndex), + AllocatedMissingFeatureVector()) {} + + /** + * @param other A reference to an object of type `BinnedFeatureVectorView` that should be copied + */ + BinnedFeatureVectorDecorator(const BinnedFeatureVectorView& other) + : BinnedFeatureVectorDecorator( + AllocatedBinnedFeatureVector(other.getView().firstView.numBins, + other.getView().firstView.indptr[other.getView().firstView.numBins], + other.getView().firstView.sparseBinIndex), + 
AllocatedMissingFeatureVector()) {} + + /** + * @param other A reference to an object of type `AllocatedBinnedFeatureVectorView` that should be copied + */ + BinnedFeatureVectorDecorator(const AllocatedBinnedFeatureVectorView& other) + : BinnedFeatureVectorDecorator( + AllocatedBinnedFeatureVector(other.getView().firstView.numBins, + other.getView().firstView.indptr[other.getView().firstView.numBins], + other.getView().firstView.sparseBinIndex), + AllocatedMissingFeatureVector()) {} + + void searchForRefinement(FeatureBasedSearch& featureBasedSearch, IWeightedStatisticsSubset& statisticsSubset, + SingleRefinementComparator& comparator, uint32 numExamplesWithNonZeroWeights, + uint32 minCoverage, Refinement& refinement) const override { + featureBasedSearch.searchForBinnedRefinement(this->view.firstView, this->view.secondView, statisticsSubset, + comparator, numExamplesWithNonZeroWeights, minCoverage, + refinement); + } + + void searchForRefinement(FeatureBasedSearch& featureBasedSearch, IWeightedStatisticsSubset& statisticsSubset, + FixedRefinementComparator& comparator, uint32 numExamplesWithNonZeroWeights, + uint32 minCoverage, Refinement& refinement) const override { + featureBasedSearch.searchForBinnedRefinement(this->view.firstView, this->view.secondView, statisticsSubset, + comparator, numExamplesWithNonZeroWeights, minCoverage, + refinement); + } + + std::unique_ptr createFilteredFeatureVector(std::unique_ptr& existing, + const Interval& interval) const override { + std::optional filteredFeatureVector = + createFilteredBinnedFeatureVectorView(*this, existing, interval); + + if (filteredFeatureVector) { + BinnedFeatureVectorDecorator* existingDecorator = + dynamic_cast(existing.get()); + + if (existingDecorator) { + return std::make_unique( + std::move(existingDecorator->view.firstView), std::move(*filteredFeatureVector)); + } + + return std::make_unique(std::move(*filteredFeatureVector)); + } + + return std::make_unique(); + } + + std::unique_ptr 
createFilteredFeatureVector(std::unique_ptr& existing, + const CoverageMask& coverageMask) const override { + return createFilteredBinnedFeatureVectorDecorator(*this, existing, + coverageMask); + } +}; diff --git a/cpp/subprojects/common/src/mlrl/common/input/feature_vector_decorator_binned_common.hpp b/cpp/subprojects/common/src/mlrl/common/input/feature_vector_decorator_binned_common.hpp new file mode 100644 index 0000000000..8061284fed --- /dev/null +++ b/cpp/subprojects/common/src/mlrl/common/input/feature_vector_decorator_binned_common.hpp @@ -0,0 +1,91 @@ +/* + * @author Michael Rapp (michael.rapp.ml@gmail.com) + */ +#pragma once + +#include "feature_vector_decorator.hpp" +#include "feature_vector_nominal_allocated.hpp" +#include "mlrl/common/input/feature_vector_equal.hpp" + +template +static inline void updateCoverageMaskAndStatisticsBasedOnBinnedFeatureVector(const View& view, const Interval& interval, + CoverageMask& coverageMask, + uint32 indicatorValue, + IWeightedStatistics& statistics) { + const FeatureVector& featureVector = view.getView().firstView; + CoverageMask::iterator coverageMaskIterator = coverageMask.begin(); + + if (interval.inverse) { + // Discard the indices that correspond to the values in the range [interval.start, interval.end) and set the + // corresponding values in `coverageMask` to `indicatorValue`, which marks them as uncovered... 
+ for (uint32 i = interval.start; i < interval.end; i++) { + typename FeatureVector::index_const_iterator indexIterator = featureVector.indices_cbegin(i); + typename FeatureVector::index_const_iterator indicesEnd = featureVector.indices_cend(i); + uint32 numIndices = indicesEnd - indexIterator; + + for (uint32 j = 0; j < numIndices; j++) { + uint32 index = indexIterator[j]; + coverageMaskIterator[index] = indicatorValue; + statistics.removeCoveredStatistic(index); + } + } + + updateCoverageMaskAndStatisticsBasedOnMissingFeatureVector(view, coverageMaskIterator, indicatorValue, + statistics); + } else { + coverageMask.indicatorValue = indicatorValue; + statistics.resetCoveredStatistics(); + + // Retain the indices in the range [interval.start, interval.end) and set the corresponding values in the given + // `coverageMask` to `indicatorValue` to mark them as covered... + for (uint32 i = interval.start; i < interval.end; i++) { + typename FeatureVector::index_const_iterator indexIterator = featureVector.indices_cbegin(i); + typename FeatureVector::index_const_iterator indicesEnd = featureVector.indices_cend(i); + uint32 numIndices = indicesEnd - indexIterator; + + for (uint32 j = 0; j < numIndices; j++) { + uint32 index = indexIterator[j]; + coverageMaskIterator[index] = indicatorValue; + statistics.addCoveredStatistic(index); + } + } + } +} + +/** + * An abstract base class for all decorators that provide access to the bins stored in a feature vector. 
+ * + * @tparam AllocatedFeatureVector The type of the view that provides access to the bins in the feature vector + */ +template +class AbstractBinnedFeatureVectorDecorator : public AbstractFeatureVectorDecorator { + public: + + /** + * @param firstView A reference to an object of template type `AllocatedFeatureVector` + * @param secondView A reference to an object of type `AllocatedMissingFeatureVector` + */ + AbstractBinnedFeatureVectorDecorator(AllocatedFeatureVector&& firstView, + AllocatedMissingFeatureVector&& secondView) + : AbstractFeatureVectorDecorator(std::move(firstView), std::move(secondView)) {} + + /** + * @param other A reference to an object of type `AbstractBinnedFeatureVectorDecorator` that should be copied + */ + AbstractBinnedFeatureVectorDecorator(const AbstractBinnedFeatureVectorDecorator& other) + : AbstractBinnedFeatureVectorDecorator( + AllocatedFeatureVector(other.view.firstView.numValues, + other.view.firstView.indptr[other.view.firstView.numValues], + other.view.firstView.majorityValue), + AllocatedMissingFeatureVector()) {} + + virtual ~AbstractBinnedFeatureVectorDecorator() override {} + + void updateCoverageMaskAndStatistics(const Interval& interval, CoverageMask& coverageMask, + uint32 indicatorValue, + IWeightedStatistics& statistics) const override final { + updateCoverageMaskAndStatisticsBasedOnBinnedFeatureVector( + *this, interval, coverageMask, indicatorValue, statistics); + } +}; diff --git a/cpp/subprojects/common/src/mlrl/common/input/feature_vector_decorator_nominal.hpp b/cpp/subprojects/common/src/mlrl/common/input/feature_vector_decorator_nominal.hpp index f2136b4ec3..138aa7c038 100644 --- a/cpp/subprojects/common/src/mlrl/common/input/feature_vector_decorator_nominal.hpp +++ b/cpp/subprojects/common/src/mlrl/common/input/feature_vector_decorator_nominal.hpp @@ -3,6 +3,7 @@ */ #pragma once +#include "feature_vector_decorator_binned_common.hpp" #include "feature_vector_decorator_nominal_common.hpp" template @@ -71,7 
+72,7 @@ static inline std::unique_ptr createFilteredNominalFeatureVector * Provides random read and write access, as well as read and write access via iterators, to the values and indices of * training examples stored in an `AllocatedNominalFeatureVector`. */ -class NominalFeatureVectorDecorator final : public AbstractNominalFeatureVectorDecorator { +class NominalFeatureVectorDecorator final : public AbstractBinnedFeatureVectorDecorator { public: /** @@ -80,30 +81,29 @@ class NominalFeatureVectorDecorator final : public AbstractNominalFeatureVectorD */ NominalFeatureVectorDecorator(AllocatedNominalFeatureVector&& firstView, AllocatedMissingFeatureVector&& secondView) - : AbstractNominalFeatureVectorDecorator(std::move(firstView), std::move(secondView)) {} + : AbstractBinnedFeatureVectorDecorator(std::move(firstView), + std::move(secondView)) {} /** * @param other A reference to an object of type `NominalFeatureVectorDecorator` that should be copied */ NominalFeatureVectorDecorator(const NominalFeatureVectorDecorator& other) - : AbstractNominalFeatureVectorDecorator(other) {} - - void searchForRefinement(RuleRefinementSearch& ruleRefinementSearch, - IWeightedStatisticsSubset& statisticsSubset, SingleRefinementComparator& comparator, - uint32 numExamplesWithNonZeroWeights, uint32 minCoverage, - Refinement& refinement) const override { - ruleRefinementSearch.searchForNominalRefinement(this->view.firstView, this->view.secondView, - statisticsSubset, comparator, numExamplesWithNonZeroWeights, - minCoverage, refinement); + : AbstractBinnedFeatureVectorDecorator(other) {} + + void searchForRefinement(FeatureBasedSearch& featureBasedSearch, IWeightedStatisticsSubset& statisticsSubset, + SingleRefinementComparator& comparator, uint32 numExamplesWithNonZeroWeights, + uint32 minCoverage, Refinement& refinement) const override { + featureBasedSearch.searchForNominalRefinement(this->view.firstView, this->view.secondView, statisticsSubset, + comparator, 
numExamplesWithNonZeroWeights, minCoverage, + refinement); } - void searchForRefinement(RuleRefinementSearch& ruleRefinementSearch, - IWeightedStatisticsSubset& statisticsSubset, FixedRefinementComparator& comparator, - uint32 numExamplesWithNonZeroWeights, uint32 minCoverage, - Refinement& refinement) const override { - ruleRefinementSearch.searchForNominalRefinement(this->view.firstView, this->view.secondView, - statisticsSubset, comparator, numExamplesWithNonZeroWeights, - minCoverage, refinement); + void searchForRefinement(FeatureBasedSearch& featureBasedSearch, IWeightedStatisticsSubset& statisticsSubset, + FixedRefinementComparator& comparator, uint32 numExamplesWithNonZeroWeights, + uint32 minCoverage, Refinement& refinement) const override { + featureBasedSearch.searchForNominalRefinement(this->view.firstView, this->view.secondView, statisticsSubset, + comparator, numExamplesWithNonZeroWeights, minCoverage, + refinement); } std::unique_ptr createFilteredFeatureVector(std::unique_ptr& existing, diff --git a/cpp/subprojects/common/src/mlrl/common/input/feature_vector_decorator_nominal_common.hpp b/cpp/subprojects/common/src/mlrl/common/input/feature_vector_decorator_nominal_common.hpp index 506f0bedb3..c871065d50 100644 --- a/cpp/subprojects/common/src/mlrl/common/input/feature_vector_decorator_nominal_common.hpp +++ b/cpp/subprojects/common/src/mlrl/common/input/feature_vector_decorator_nominal_common.hpp @@ -3,56 +3,9 @@ */ #pragma once -#include "feature_vector_decorator.hpp" #include "feature_vector_nominal_allocated.hpp" #include "mlrl/common/input/feature_vector_equal.hpp" -template -static inline void updateCoverageMaskAndStatisticsBasedOnNominalFeatureVector(const View& view, - const Interval& interval, - CoverageMask& coverageMask, - uint32 indicatorValue, - IWeightedStatistics& statistics) { - const NominalFeatureVector& featureVector = view.getView().firstView; - CoverageMask::iterator coverageMaskIterator = coverageMask.begin(); - - if 
(interval.inverse) { - // Discard the indices that correspond to the values in the range [interval.start, interval.end) and set the - // corresponding values in `coverageMask` to `indicatorValue`, which marks them as uncovered... - for (uint32 i = interval.start; i < interval.end; i++) { - NominalFeatureVector::index_const_iterator indexIterator = featureVector.indices_cbegin(i); - NominalFeatureVector::index_const_iterator indicesEnd = featureVector.indices_cend(i); - uint32 numIndices = indicesEnd - indexIterator; - - for (uint32 j = 0; j < numIndices; j++) { - uint32 index = indexIterator[j]; - coverageMaskIterator[index] = indicatorValue; - statistics.removeCoveredStatistic(index); - } - } - - updateCoverageMaskAndStatisticsBasedOnMissingFeatureVector(view, coverageMaskIterator, indicatorValue, - statistics); - } else { - coverageMask.setIndicatorValue(indicatorValue); - statistics.resetCoveredStatistics(); - - // Retain the indices in the range [interval.start, interval.end) and set the corresponding values in the given - // `coverageMask` to `indicatorValue` to mark them as covered... 
- for (uint32 i = interval.start; i < interval.end; i++) { - NominalFeatureVector::index_const_iterator indexIterator = featureVector.indices_cbegin(i); - NominalFeatureVector::index_const_iterator indicesEnd = featureVector.indices_cend(i); - uint32 numIndices = indicesEnd - indexIterator; - - for (uint32 j = 0; j < numIndices; j++) { - uint32 index = indexIterator[j]; - coverageMaskIterator[index] = indicatorValue; - statistics.addCoveredStatistic(index); - } - } - } -} - template static inline std::unique_ptr createFilteredNominalFeatureVectorDecorator( const View& view, std::unique_ptr& existing, const CoverageMask& coverageMask) { @@ -77,7 +30,7 @@ static inline std::unique_ptr createFilteredNominalFeatureVector for (uint32 j = 0; j < numIndices; j++) { uint32 index = indexIterator[j]; - if (coverageMask.isCovered(index)) { + if (coverageMask[index]) { filteredIndexIterator[numFilteredIndices] = index; numFilteredIndices++; } @@ -97,39 +50,3 @@ static inline std::unique_ptr createFilteredNominalFeatureVector return std::make_unique(); } - -/** - * An abstract base class for all decorators that provide access to the values and indices of training examples stored - * in an `AllocatedNominalFeatureVector`. 
- */ -class AbstractNominalFeatureVectorDecorator : public AbstractFeatureVectorDecorator { - public: - - /** - * @param firstView A reference to an object of type `AllocatedNominalFeatureVector` - * @param secondView A reference to an object of type `AllocatedMissingFeatureVector` - */ - AbstractNominalFeatureVectorDecorator(AllocatedNominalFeatureVector&& firstView, - AllocatedMissingFeatureVector&& secondView) - : AbstractFeatureVectorDecorator(std::move(firstView), - std::move(secondView)) {} - - /** - * @param other A reference to an object of type `AbstractNominalFeatureVectorDecorator` that should be copied - */ - AbstractNominalFeatureVectorDecorator(const AbstractNominalFeatureVectorDecorator& other) - : AbstractNominalFeatureVectorDecorator( - AllocatedNominalFeatureVector(other.view.firstView.numValues, - other.view.firstView.indptr[other.view.firstView.numValues], - other.view.firstView.majorityValue), - AllocatedMissingFeatureVector()) {} - - virtual ~AbstractNominalFeatureVectorDecorator() override {} - - void updateCoverageMaskAndStatistics(const Interval& interval, CoverageMask& coverageMask, - uint32 indicatorValue, - IWeightedStatistics& statistics) const override final { - updateCoverageMaskAndStatisticsBasedOnNominalFeatureVector(*this, interval, coverageMask, indicatorValue, - statistics); - } -}; diff --git a/cpp/subprojects/common/src/mlrl/common/input/feature_vector_decorator_numerical.hpp b/cpp/subprojects/common/src/mlrl/common/input/feature_vector_decorator_numerical.hpp index da2f246cef..aa6107d445 100644 --- a/cpp/subprojects/common/src/mlrl/common/input/feature_vector_decorator_numerical.hpp +++ b/cpp/subprojects/common/src/mlrl/common/input/feature_vector_decorator_numerical.hpp @@ -13,27 +13,9 @@ template static inline std::optional createFilteredNumericalFeatureVectorView( const Decorator& decorator, std::unique_ptr& existing, const Interval& interval) { const NumericalFeatureVector& featureVector = decorator.getView().firstView; - 
uint32 start = interval.start; - uint32 end = interval.end; - - if (interval.inverse) { - if (interval.start > 0) { - start = 0; - end = interval.start; - } else { - start = interval.end; - end = featureVector.numElements; - } - } else { - start = interval.start; - - if (start > 0) { - end = featureVector.numElements; - } else { - end = interval.end; - } - } - + Tuple tuple = getStartAndEndOfOpenInterval(interval, featureVector.numElements); + uint32 start = tuple.first; + uint32 end = tuple.second; uint32 numFilteredElements = end - start; if (numFilteredElements > 0 @@ -61,7 +43,7 @@ static inline std::unique_ptr createFilteredNumericalFeatureVect for (uint32 i = 0; i < filteredFeatureVector.numElements; i++) { const IndexedValue& entry = iterator[i]; - if (coverageMask.isCovered(entry.index)) { + if (coverageMask[entry.index]) { filteredIterator[numFilteredElements] = entry; numFilteredElements++; } @@ -90,22 +72,20 @@ class AbstractNumericalFeatureVectorDecorator : public AbstractFeatureVectorDeco virtual ~AbstractNumericalFeatureVectorDecorator() override {} - void searchForRefinement(RuleRefinementSearch& ruleRefinementSearch, - IWeightedStatisticsSubset& statisticsSubset, SingleRefinementComparator& comparator, - uint32 numExamplesWithNonZeroWeights, uint32 minCoverage, - Refinement& refinement) const override { - ruleRefinementSearch.searchForNumericalRefinement(this->view.firstView, this->view.secondView, - statisticsSubset, comparator, - numExamplesWithNonZeroWeights, minCoverage, refinement); + void searchForRefinement(FeatureBasedSearch& featureBasedSearch, IWeightedStatisticsSubset& statisticsSubset, + SingleRefinementComparator& comparator, uint32 numExamplesWithNonZeroWeights, + uint32 minCoverage, Refinement& refinement) const override { + featureBasedSearch.searchForNumericalRefinement(this->view.firstView, this->view.secondView, + statisticsSubset, comparator, numExamplesWithNonZeroWeights, + minCoverage, refinement); } - void 
searchForRefinement(RuleRefinementSearch& ruleRefinementSearch, - IWeightedStatisticsSubset& statisticsSubset, FixedRefinementComparator& comparator, - uint32 numExamplesWithNonZeroWeights, uint32 minCoverage, - Refinement& refinement) const override { - ruleRefinementSearch.searchForNumericalRefinement(this->view.firstView, this->view.secondView, - statisticsSubset, comparator, - numExamplesWithNonZeroWeights, minCoverage, refinement); + void searchForRefinement(FeatureBasedSearch& featureBasedSearch, IWeightedStatisticsSubset& statisticsSubset, + FixedRefinementComparator& comparator, uint32 numExamplesWithNonZeroWeights, + uint32 minCoverage, Refinement& refinement) const override { + featureBasedSearch.searchForNumericalRefinement(this->view.firstView, this->view.secondView, + statisticsSubset, comparator, numExamplesWithNonZeroWeights, + minCoverage, refinement); } void updateCoverageMaskAndStatistics(const Interval& interval, CoverageMask& coverageMask, @@ -126,7 +106,7 @@ class AbstractNumericalFeatureVectorDecorator : public AbstractFeatureVectorDeco updateCoverageMaskAndStatisticsBasedOnMissingFeatureVector(*this, coverageMaskIterator, indicatorValue, statistics); } else { - coverageMask.setIndicatorValue(indicatorValue); + coverageMask.indicatorValue = indicatorValue; statistics.resetCoveredStatistics(); // Retain the indices in the range [interval.start, interval.end) and set the corresponding values in diff --git a/cpp/subprojects/common/src/mlrl/common/input/feature_vector_decorator_ordinal.hpp b/cpp/subprojects/common/src/mlrl/common/input/feature_vector_decorator_ordinal.hpp index ded0da3b1c..791bcf78e0 100644 --- a/cpp/subprojects/common/src/mlrl/common/input/feature_vector_decorator_ordinal.hpp +++ b/cpp/subprojects/common/src/mlrl/common/input/feature_vector_decorator_ordinal.hpp @@ -3,6 +3,7 @@ */ #pragma once +#include "feature_vector_decorator_binned_common.hpp" #include "feature_vector_decorator_nominal_common.hpp" #include @@ -11,27 +12,9 @@ 
template static inline std::optional createFilteredOrdinalFeatureVectorView( const Decorator& decorator, std::unique_ptr& existing, const Interval& interval) { const NominalFeatureVector& featureVector = decorator.getView().firstView; - uint32 start; - uint32 end; - - if (interval.inverse) { - if (interval.start > 0) { - start = 0; - end = interval.start; - } else { - start = interval.end; - end = featureVector.numValues; - } - } else { - start = interval.start; - - if (start > 0) { - end = featureVector.numValues; - } else { - end = interval.end; - } - } - + Tuple tuple = getStartAndEndOfOpenInterval(interval, featureVector.numValues); + uint32 start = tuple.first; + uint32 end = tuple.second; uint32 numFilteredValues = end - start; if (numFilteredValues > 0) { @@ -59,29 +42,27 @@ class OrdinalFeatureVectorView final : public AbstractFeatureVectorDecoratorview.firstView, this->view.secondView, - statisticsSubset, comparator, numExamplesWithNonZeroWeights, - minCoverage, refinement); + void searchForRefinement(FeatureBasedSearch& featureBasedSearch, IWeightedStatisticsSubset& statisticsSubset, + SingleRefinementComparator& comparator, uint32 numExamplesWithNonZeroWeights, + uint32 minCoverage, Refinement& refinement) const override { + featureBasedSearch.searchForOrdinalRefinement(this->view.firstView, this->view.secondView, statisticsSubset, + comparator, numExamplesWithNonZeroWeights, minCoverage, + refinement); } - void searchForRefinement(RuleRefinementSearch& ruleRefinementSearch, - IWeightedStatisticsSubset& statisticsSubset, FixedRefinementComparator& comparator, - uint32 numExamplesWithNonZeroWeights, uint32 minCoverage, - Refinement& refinement) const override { - ruleRefinementSearch.searchForOrdinalRefinement(this->view.firstView, this->view.secondView, - statisticsSubset, comparator, numExamplesWithNonZeroWeights, - minCoverage, refinement); + void searchForRefinement(FeatureBasedSearch& featureBasedSearch, IWeightedStatisticsSubset& statisticsSubset, + 
FixedRefinementComparator& comparator, uint32 numExamplesWithNonZeroWeights, + uint32 minCoverage, Refinement& refinement) const override { + featureBasedSearch.searchForOrdinalRefinement(this->view.firstView, this->view.secondView, statisticsSubset, + comparator, numExamplesWithNonZeroWeights, minCoverage, + refinement); } void updateCoverageMaskAndStatistics(const Interval& interval, CoverageMask& coverageMask, uint32 indicatorValue, IWeightedStatistics& statistics) const override final { - updateCoverageMaskAndStatisticsBasedOnNominalFeatureVector(*this, interval, coverageMask, indicatorValue, - statistics); + updateCoverageMaskAndStatisticsBasedOnBinnedFeatureVector( + *this, interval, coverageMask, indicatorValue, statistics); } std::unique_ptr createFilteredFeatureVector(std::unique_ptr& existing, @@ -125,29 +106,28 @@ class AllocatedOrdinalFeatureVectorView final : public AbstractFeatureVectorDeco AllocatedMissingFeatureVector()), allocatedView(std::move(allocatedView)) {} - void searchForRefinement(RuleRefinementSearch& ruleRefinementSearch, - IWeightedStatisticsSubset& statisticsSubset, SingleRefinementComparator& comparator, - uint32 numExamplesWithNonZeroWeights, uint32 minCoverage, - Refinement& refinement) const override { - ruleRefinementSearch.searchForOrdinalRefinement(this->view.firstView, this->view.secondView, - statisticsSubset, comparator, numExamplesWithNonZeroWeights, - minCoverage, refinement); + void searchForRefinement(FeatureBasedSearch& featureBasedSearch, IWeightedStatisticsSubset& statisticsSubset, + SingleRefinementComparator& comparator, uint32 numExamplesWithNonZeroWeights, + uint32 minCoverage, Refinement& refinement) const override { + featureBasedSearch.searchForOrdinalRefinement(this->view.firstView, this->view.secondView, statisticsSubset, + comparator, numExamplesWithNonZeroWeights, minCoverage, + refinement); } - void searchForRefinement(RuleRefinementSearch& ruleRefinementSearch, - IWeightedStatisticsSubset& 
statisticsSubset, FixedRefinementComparator& comparator, - uint32 numExamplesWithNonZeroWeights, uint32 minCoverage, - Refinement& refinement) const override { - ruleRefinementSearch.searchForOrdinalRefinement(this->view.firstView, this->view.secondView, - statisticsSubset, comparator, numExamplesWithNonZeroWeights, - minCoverage, refinement); + void searchForRefinement(FeatureBasedSearch& featureBasedSearch, IWeightedStatisticsSubset& statisticsSubset, + FixedRefinementComparator& comparator, uint32 numExamplesWithNonZeroWeights, + uint32 minCoverage, Refinement& refinement) const override { + featureBasedSearch.searchForOrdinalRefinement(this->view.firstView, this->view.secondView, statisticsSubset, + comparator, numExamplesWithNonZeroWeights, minCoverage, + refinement); } void updateCoverageMaskAndStatistics(const Interval& interval, CoverageMask& coverageMask, uint32 indicatorValue, IWeightedStatistics& statistics) const override final { - updateCoverageMaskAndStatisticsBasedOnNominalFeatureVector(*this, interval, coverageMask, indicatorValue, - statistics); + updateCoverageMaskAndStatisticsBasedOnBinnedFeatureVector( + *this, interval, coverageMask, indicatorValue, statistics); } std::unique_ptr createFilteredFeatureVector(std::unique_ptr& existing, @@ -182,7 +162,7 @@ class AllocatedOrdinalFeatureVectorView final : public AbstractFeatureVectorDeco * Provides random read and write access, as well as read and write access via iterators, to the values and indicies of * training examples stored in an `AllocatedNominalFeatureVector`. 
*/ -class OrdinalFeatureVectorDecorator final : public AbstractNominalFeatureVectorDecorator { +class OrdinalFeatureVectorDecorator final : public AbstractBinnedFeatureVectorDecorator { public: /** @@ -191,13 +171,14 @@ class OrdinalFeatureVectorDecorator final : public AbstractNominalFeatureVectorD */ OrdinalFeatureVectorDecorator(AllocatedNominalFeatureVector&& firstView, AllocatedMissingFeatureVector&& secondView) - : AbstractNominalFeatureVectorDecorator(std::move(firstView), std::move(secondView)) {} + : AbstractBinnedFeatureVectorDecorator(std::move(firstView), + std::move(secondView)) {} /** * @param other A reference to an object of type `OrdinalFeatureVectorDecorator` that should be copied */ OrdinalFeatureVectorDecorator(const OrdinalFeatureVectorDecorator& other) - : AbstractNominalFeatureVectorDecorator(other) {} + : AbstractBinnedFeatureVectorDecorator(other) {} /** * @param other A reference to an object of type `OrdinalFeatureVectorView` that should be copied @@ -219,22 +200,20 @@ class OrdinalFeatureVectorDecorator final : public AbstractNominalFeatureVectorD other.getView().firstView.majorityValue), AllocatedMissingFeatureVector()) {} - void searchForRefinement(RuleRefinementSearch& ruleRefinementSearch, - IWeightedStatisticsSubset& statisticsSubset, SingleRefinementComparator& comparator, - uint32 numExamplesWithNonZeroWeights, uint32 minCoverage, - Refinement& refinement) const override { - ruleRefinementSearch.searchForOrdinalRefinement(this->view.firstView, this->view.secondView, - statisticsSubset, comparator, numExamplesWithNonZeroWeights, - minCoverage, refinement); + void searchForRefinement(FeatureBasedSearch& featureBasedSearch, IWeightedStatisticsSubset& statisticsSubset, + SingleRefinementComparator& comparator, uint32 numExamplesWithNonZeroWeights, + uint32 minCoverage, Refinement& refinement) const override { + featureBasedSearch.searchForOrdinalRefinement(this->view.firstView, this->view.secondView, statisticsSubset, + comparator, 
numExamplesWithNonZeroWeights, minCoverage, + refinement); } - void searchForRefinement(RuleRefinementSearch& ruleRefinementSearch, - IWeightedStatisticsSubset& statisticsSubset, FixedRefinementComparator& comparator, - uint32 numExamplesWithNonZeroWeights, uint32 minCoverage, - Refinement& refinement) const override { - ruleRefinementSearch.searchForOrdinalRefinement(this->view.firstView, this->view.secondView, - statisticsSubset, comparator, numExamplesWithNonZeroWeights, - minCoverage, refinement); + void searchForRefinement(FeatureBasedSearch& featureBasedSearch, IWeightedStatisticsSubset& statisticsSubset, + FixedRefinementComparator& comparator, uint32 numExamplesWithNonZeroWeights, + uint32 minCoverage, Refinement& refinement) const override { + featureBasedSearch.searchForOrdinalRefinement(this->view.firstView, this->view.secondView, statisticsSubset, + comparator, numExamplesWithNonZeroWeights, minCoverage, + refinement); } std::unique_ptr createFilteredFeatureVector(std::unique_ptr& existing, diff --git a/cpp/subprojects/common/src/mlrl/common/input/feature_vector_equal.cpp b/cpp/subprojects/common/src/mlrl/common/input/feature_vector_equal.cpp index 61b2c32538..45338ee8d4 100644 --- a/cpp/subprojects/common/src/mlrl/common/input/feature_vector_equal.cpp +++ b/cpp/subprojects/common/src/mlrl/common/input/feature_vector_equal.cpp @@ -1,12 +1,12 @@ #include "mlrl/common/input/feature_vector_equal.hpp" -void EqualFeatureVector::searchForRefinement(RuleRefinementSearch& ruleRefinementSearch, +void EqualFeatureVector::searchForRefinement(FeatureBasedSearch& featureBasedSearch, IWeightedStatisticsSubset& statisticsSubset, SingleRefinementComparator& comparator, uint32 numExamplesWithNonZeroWeights, uint32 minCoverage, Refinement& refinement) const {} -void EqualFeatureVector::searchForRefinement(RuleRefinementSearch& ruleRefinementSearch, +void EqualFeatureVector::searchForRefinement(FeatureBasedSearch& featureBasedSearch, IWeightedStatisticsSubset& 
statisticsSubset, FixedRefinementComparator& comparator, uint32 numExamplesWithNonZeroWeights, uint32 minCoverage, diff --git a/cpp/subprojects/common/src/mlrl/common/input/missing_feature_vector.cpp b/cpp/subprojects/common/src/mlrl/common/input/missing_feature_vector.cpp deleted file mode 100644 index 52b8f0ec80..0000000000 --- a/cpp/subprojects/common/src/mlrl/common/input/missing_feature_vector.cpp +++ /dev/null @@ -1,28 +0,0 @@ -#include "mlrl/common/input/missing_feature_vector.hpp" - -OldMissingFeatureVector::OldMissingFeatureVector() - : missingIndicesPtr_( - std::make_unique>>(AllocatedBinaryDokVector())) {} - -OldMissingFeatureVector::OldMissingFeatureVector(OldMissingFeatureVector& missingFeatureVector) - : missingIndicesPtr_(std::move(missingFeatureVector.missingIndicesPtr_)) {} - -OldMissingFeatureVector::missing_index_const_iterator OldMissingFeatureVector::missing_indices_cbegin() const { - return missingIndicesPtr_->getView().indices_cbegin(); -} - -OldMissingFeatureVector::missing_index_const_iterator OldMissingFeatureVector::missing_indices_cend() const { - return missingIndicesPtr_->getView().indices_cend(); -} - -void OldMissingFeatureVector::addMissingIndex(uint32 index) { - missingIndicesPtr_->getView().set(index, true); -} - -bool OldMissingFeatureVector::isMissing(uint32 index) const { - return (missingIndicesPtr_->getView())[index]; -} - -void OldMissingFeatureVector::clearMissingIndices() { - missingIndicesPtr_->clear(); -} diff --git a/cpp/subprojects/common/src/mlrl/common/learner.cpp b/cpp/subprojects/common/src/mlrl/common/learner.cpp index b81948daa9..3265ad2c6e 100644 --- a/cpp/subprojects/common/src/mlrl/common/learner.cpp +++ b/cpp/subprojects/common/src/mlrl/common/learner.cpp @@ -1,6 +1,7 @@ #include "mlrl/common/learner.hpp" #include "mlrl/common/prediction/label_space_info_no.hpp" +#include "mlrl/common/rule_refinement/feature_space_tabular.hpp" #include "mlrl/common/stopping/stopping_criterion_size.hpp" #include 
"mlrl/common/util/validation.hpp" @@ -88,7 +89,7 @@ AbstractRuleLearner::Config::Config(RuleCompareFunction ruleCompareFunction) ruleModelAssemblageConfigPtr_(std::make_unique(defaultRuleConfigPtr_)), ruleInductionConfigPtr_( std::make_unique(ruleCompareFunction_, parallelRuleRefinementConfigPtr_)), - featureBinningConfigPtr_(std::make_unique(parallelStatisticUpdateConfigPtr_)), + featureBinningConfigPtr_(std::make_unique()), labelSamplingConfigPtr_(std::make_unique()), instanceSamplingConfigPtr_(std::make_unique()), featureSamplingConfigPtr_(std::make_unique()), @@ -208,9 +209,13 @@ std::unique_ptr AbstractRuleLearner::createRuleMode return config_.getRuleModelAssemblageConfigPtr()->createRuleModelAssemblageFactory(labelMatrix); } -std::unique_ptr AbstractRuleLearner::createThresholdsFactory( +std::unique_ptr AbstractRuleLearner::createFeatureSpaceFactory( const IFeatureMatrix& featureMatrix, const ILabelMatrix& labelMatrix) const { - return config_.getFeatureBinningConfigPtr()->createThresholdsFactory(featureMatrix, labelMatrix); + std::unique_ptr featureBinningFactoryPtr = + config_.getFeatureBinningConfigPtr()->createFeatureBinningFactory(featureMatrix, labelMatrix); + uint32 numThreads = + config_.getParallelStatisticUpdateConfigPtr()->getNumThreads(featureMatrix, labelMatrix.getNumLabels()); + return std::make_unique(std::move(featureBinningFactoryPtr), numThreads); } std::unique_ptr AbstractRuleLearner::createRuleInductionFactory( @@ -400,11 +405,11 @@ std::unique_ptr AbstractRuleLearner::fit(const IFeatureInfo& fe std::unique_ptr statisticsProviderPtr = labelMatrix.createStatisticsProvider(*statisticsProviderFactoryPtr); - // Create thresholds... - std::unique_ptr thresholdsFactoryPtr = - this->createThresholdsFactory(featureMatrix, labelMatrix); - std::unique_ptr thresholdsPtr = - thresholdsFactoryPtr->create(featureMatrix, featureInfo, *statisticsProviderPtr); + // Create feature space... 
+ std::unique_ptr featureSpaceFactoryPtr = + this->createFeatureSpaceFactory(featureMatrix, labelMatrix); + std::unique_ptr featureSpacePtr = + featureSpaceFactoryPtr->create(featureMatrix, featureInfo, *statisticsProviderPtr); // Create rule induction... std::unique_ptr ruleInductionFactoryPtr = @@ -440,10 +445,10 @@ std::unique_ptr AbstractRuleLearner::fit(const IFeatureInfo& fe ruleModelAssemblageFactoryPtr->create(std::move(stoppingCriterionFactoryPtr)); ruleModelAssemblagePtr->induceRules(*ruleInductionPtr, *rulePruningPtr, *postProcessorPtr, partition, *labelSamplingPtr, *instanceSamplingPtr, *featureSamplingPtr, - *statisticsProviderPtr, *thresholdsPtr, modelBuilder, rng); + *statisticsProviderPtr, *featureSpacePtr, modelBuilder, rng); // Post-optimize the model... - postOptimizationPtr->optimizeModel(*thresholdsPtr, *ruleInductionPtr, partition, *labelSamplingPtr, + postOptimizationPtr->optimizeModel(*featureSpacePtr, *ruleInductionPtr, partition, *labelSamplingPtr, *instanceSamplingPtr, *featureSamplingPtr, *rulePruningPtr, *postProcessorPtr, rng); diff --git a/cpp/subprojects/common/src/mlrl/common/post_optimization/post_optimization_phase_list.cpp b/cpp/subprojects/common/src/mlrl/common/post_optimization/post_optimization_phase_list.cpp index a1bfe77514..2cb0ea4615 100644 --- a/cpp/subprojects/common/src/mlrl/common/post_optimization/post_optimization_phase_list.cpp +++ b/cpp/subprojects/common/src/mlrl/common/post_optimization/post_optimization_phase_list.cpp @@ -36,13 +36,13 @@ class PostOptimizationPhaseList final : public IPostOptimization { return *intermediateModelBuilderPtr_; } - void optimizeModel(IThresholds& thresholds, const IRuleInduction& ruleInduction, IPartition& partition, + void optimizeModel(IFeatureSpace& featureSpace, const IRuleInduction& ruleInduction, IPartition& partition, ILabelSampling& labelSampling, IInstanceSampling& instanceSampling, IFeatureSampling& featureSampling, const IRulePruning& rulePruning, const IPostProcessor& 
postProcessor, RNG& rng) const override { for (auto it = postOptimizationPhases_.cbegin(); it != postOptimizationPhases_.cend(); it++) { const std::unique_ptr& postOptimizationPhasePtr = *it; - postOptimizationPhasePtr->optimizeModel(thresholds, ruleInduction, partition, labelSampling, + postOptimizationPhasePtr->optimizeModel(featureSpace, ruleInduction, partition, labelSampling, instanceSampling, featureSampling, rulePruning, postProcessor, rng); } @@ -71,7 +71,7 @@ class NoPostOptimization final : public IPostOptimization { return *modelBuilderPtr_; } - void optimizeModel(IThresholds& thresholds, const IRuleInduction& ruleInduction, IPartition& partition, + void optimizeModel(IFeatureSpace& featureSpace, const IRuleInduction& ruleInduction, IPartition& partition, ILabelSampling& labelSampling, IInstanceSampling& instanceSampling, IFeatureSampling& featureSampling, const IRulePruning& rulePruning, const IPostProcessor& postProcessor, RNG& rng) const override { diff --git a/cpp/subprojects/common/src/mlrl/common/post_optimization/post_optimization_sequential.cpp b/cpp/subprojects/common/src/mlrl/common/post_optimization/post_optimization_sequential.cpp index 3be97ccf18..99bd450a50 100644 --- a/cpp/subprojects/common/src/mlrl/common/post_optimization/post_optimization_sequential.cpp +++ b/cpp/subprojects/common/src/mlrl/common/post_optimization/post_optimization_sequential.cpp @@ -67,7 +67,7 @@ class SequentialPostOptimization final : public IPostOptimizationPhase { : modelBuilder_(modelBuilder), numIterations_(numIterations), refineHeads_(refineHeads), resampleFeatures_(resampleFeatures) {} - void optimizeModel(IThresholds& thresholds, const IRuleInduction& ruleInduction, IPartition& partition, + void optimizeModel(IFeatureSpace& featureSpace, const IRuleInduction& ruleInduction, IPartition& partition, ILabelSampling& labelSampling, IInstanceSampling& instanceSampling, IFeatureSampling& featureSampling, const IRulePruning& rulePruning, const IPostProcessor& 
postProcessor, RNG& rng) const override { @@ -79,23 +79,23 @@ class SequentialPostOptimization final : public IPostOptimizationPhase { // Create a new subset of the given thresholds... const IWeightVector& weights = instanceSampling.sample(rng); - std::unique_ptr thresholdsSubsetPtr = weights.createThresholdsSubset(thresholds); + std::unique_ptr featureSubspacePtr = weights.createFeatureSubspace(featureSpace); // Filter the thresholds subset according to the conditions of the current rule... for (auto it2 = conditionList.cbegin(); it2 != conditionList.cend(); it2++) { const Condition& condition = *it2; - thresholdsSubsetPtr->filterThresholds(condition); + featureSubspacePtr->filterSubspace(condition); } // Revert the statistics based on the predictions of the current rule... - thresholdsSubsetPtr->revertPrediction(prediction); + featureSubspacePtr->revertPrediction(prediction); // Learn a new rule... const IIndexVector& labelIndices = refineHeads_ ? labelSampling.sample(rng) : prediction; RuleReplacementBuilder ruleReplacementBuilder(intermediateRule); if (resampleFeatures_) { - ruleInduction.induceRule(thresholds, labelIndices, weights, partition, featureSampling, + ruleInduction.induceRule(featureSpace, labelIndices, weights, partition, featureSampling, rulePruning, postProcessor, rng, ruleReplacementBuilder); } else { std::unordered_set uniqueFeatureIndices; @@ -114,7 +114,7 @@ class SequentialPostOptimization final : public IPostOptimizationPhase { } PredefinedFeatureSampling predefinedFeatureSampling(indexVector); - ruleInduction.induceRule(thresholds, labelIndices, weights, partition, + ruleInduction.induceRule(featureSpace, labelIndices, weights, partition, predefinedFeatureSampling, rulePruning, postProcessor, rng, ruleReplacementBuilder); } diff --git a/cpp/subprojects/common/src/mlrl/common/post_optimization/post_optimization_unused_rule_removal.cpp b/cpp/subprojects/common/src/mlrl/common/post_optimization/post_optimization_unused_rule_removal.cpp index 
b3a81cdf87..6ee4ae131c 100644 --- a/cpp/subprojects/common/src/mlrl/common/post_optimization/post_optimization_unused_rule_removal.cpp +++ b/cpp/subprojects/common/src/mlrl/common/post_optimization/post_optimization_unused_rule_removal.cpp @@ -16,7 +16,7 @@ class UnusedRuleRemoval final : public IPostOptimizationPhase { */ UnusedRuleRemoval(IntermediateModelBuilder& modelBuilder) : modelBuilder_(modelBuilder) {} - void optimizeModel(IThresholds& thresholds, const IRuleInduction& ruleInduction, IPartition& partition, + void optimizeModel(IFeatureSpace& featureSpace, const IRuleInduction& ruleInduction, IPartition& partition, ILabelSampling& labelSampling, IInstanceSampling& instanceSampling, IFeatureSampling& featureSampling, const IRulePruning& rulePruning, const IPostProcessor& postProcessor, RNG& rng) const override { diff --git a/cpp/subprojects/common/src/mlrl/common/rule_induction/rule_induction_common.hpp b/cpp/subprojects/common/src/mlrl/common/rule_induction/rule_induction_common.hpp index f81c5e6986..29a81e4102 100644 --- a/cpp/subprojects/common/src/mlrl/common/rule_induction/rule_induction_common.hpp +++ b/cpp/subprojects/common/src/mlrl/common/rule_induction/rule_induction_common.hpp @@ -20,8 +20,8 @@ class AbstractRuleInduction : public IRuleInduction { /** * Must be implemented by subclasses in order to grow a rule. 
* - * @param thresholds A reference to an object of type `IThresholds` that provides access to the - * thresholds that may be used by the conditions of the rule + * @param featureSpace A reference to an object of type `IFeatureSpace` that provides access to the feature + * space * @param labelIndices A reference to an object of type `IIndexVector` that provides access to the indices * of the labels for which the rule may predict * @param weights A reference to an object of type `IWeightVector` that provides access to the weights @@ -37,14 +37,15 @@ class AbstractRuleInduction : public IRuleInduction { * store the conditions of the rule * @param headPtr A reference to an unique pointer of type `IEvaluatedPrediction` that should be used * to store the head of the rule - * @return An unique pointer to an object of type `IThresholdsSubset` that has been used to + * @return An unique pointer to an object of type `IFeatureSubspace` that has been used to * grow the rule */ - virtual std::unique_ptr growRule(IThresholds& thresholds, const IIndexVector& labelIndices, - const IWeightVector& weights, IPartition& partition, - IFeatureSampling& featureSampling, RNG& rng, - std::unique_ptr& conditionListPtr, - std::unique_ptr& headPtr) const = 0; + virtual std::unique_ptr growRule(IFeatureSpace& featureSpace, + const IIndexVector& labelIndices, + const IWeightVector& weights, IPartition& partition, + IFeatureSampling& featureSampling, RNG& rng, + std::unique_ptr& conditionListPtr, + std::unique_ptr& headPtr) const = 0; public: @@ -79,29 +80,29 @@ class AbstractRuleInduction : public IRuleInduction { modelBuilder.setDefaultRule(defaultPredictionPtr); } - bool induceRule(IThresholds& thresholds, const IIndexVector& labelIndices, const IWeightVector& weights, + bool induceRule(IFeatureSpace& featureSpace, const IIndexVector& labelIndices, const IWeightVector& weights, IPartition& partition, IFeatureSampling& featureSampling, const IRulePruning& rulePruning, const IPostProcessor& 
postProcessor, RNG& rng, IModelBuilder& modelBuilder) const override final { std::unique_ptr conditionListPtr; std::unique_ptr headPtr; - std::unique_ptr thresholdsSubsetPtr = this->growRule( - thresholds, labelIndices, weights, partition, featureSampling, rng, conditionListPtr, headPtr); + std::unique_ptr featureSubspacePtr = this->growRule( + featureSpace, labelIndices, weights, partition, featureSampling, rng, conditionListPtr, headPtr); if (headPtr) { if (weights.hasZeroWeights()) { // Prune rule... - IStatisticsProvider& statisticsProvider = thresholds.getStatisticsProvider(); + IStatisticsProvider& statisticsProvider = featureSpace.getStatisticsProvider(); statisticsProvider.switchToPruningRuleEvaluation(); - std::unique_ptr coverageStatePtr = - rulePruning.prune(*thresholdsSubsetPtr, partition, *conditionListPtr, *headPtr); + std::unique_ptr coverageMaskPtr = + rulePruning.prune(*featureSubspacePtr, partition, *conditionListPtr, *headPtr); statisticsProvider.switchToRegularRuleEvaluation(); // Re-calculate the scores in the head based on the entire training data... if (recalculatePredictions_) { - const ICoverageState& coverageState = - coverageStatePtr ? *coverageStatePtr : thresholdsSubsetPtr->getCoverageState(); - partition.recalculatePrediction(*thresholdsSubsetPtr, coverageState, *headPtr); + const CoverageMask& coverageMask = + coverageMaskPtr ? *coverageMaskPtr : featureSubspacePtr->getCoverageMask(); + partition.recalculatePrediction(*featureSubspacePtr, coverageMask, *headPtr); } } @@ -109,7 +110,7 @@ class AbstractRuleInduction : public IRuleInduction { headPtr->postProcess(postProcessor); // Update the statistics by applying the predictions of the new rule... - thresholdsSubsetPtr->applyPrediction(*headPtr); + featureSubspacePtr->applyPrediction(*headPtr); // Add the induced rule to the model... 
modelBuilder.addRule(conditionListPtr, headPtr); diff --git a/cpp/subprojects/common/src/mlrl/common/rule_induction/rule_induction_top_down_beam_search.cpp b/cpp/subprojects/common/src/mlrl/common/rule_induction/rule_induction_top_down_beam_search.cpp index 27b20aafb1..a44c63fa4f 100644 --- a/cpp/subprojects/common/src/mlrl/common/rule_induction/rule_induction_top_down_beam_search.cpp +++ b/cpp/subprojects/common/src/mlrl/common/rule_induction/rule_induction_top_down_beam_search.cpp @@ -9,7 +9,7 @@ /** * A single entry of a beam, corresponding to a rule that may be further refined. It stores the conditions and the head - * of the current rule, as well as an object of type `IThresholdsSubset` that is required to search for potential + * of the current rule, as well as an object of type `IFeatureSubspace` that is required to search for potential * refinements of the rule and an `IIndexVector` that provides access to the indices of the labels for which these * refinements may predict. */ @@ -28,10 +28,10 @@ struct BeamEntry final { std::unique_ptr headPtr; /** - * An unique pointer to an object of type `IThresholdsSubset` that may be used to search for potential + * An unique pointer to an object of type `IFeatureSubspace` that may be used to search for potential * refinements of the rule. 
*/ - std::unique_ptr thresholdsSubsetPtr; + std::unique_ptr featureSubspacePtr; /** * A pointer to an object of type `IIndexVector` that provides access to the indices of the labels for which @@ -41,10 +41,10 @@ struct BeamEntry final { }; static inline void initializeEntry(BeamEntry& entry, Refinement& refinement, - std::unique_ptr thresholdsSubsetPtr, + std::unique_ptr featureSubspacePtr, const IIndexVector& labelIndices, bool keepHead) { - thresholdsSubsetPtr->filterThresholds(refinement); - entry.thresholdsSubsetPtr = std::move(thresholdsSubsetPtr); + featureSubspacePtr->filterSubspace(refinement); + entry.featureSubspacePtr = std::move(featureSubspacePtr); entry.conditionListPtr = std::make_unique(); entry.conditionListPtr->addCondition(refinement); entry.headPtr = std::move(refinement.headPtr); @@ -52,10 +52,10 @@ static inline void initializeEntry(BeamEntry& entry, Refinement& refinement, } static inline void copyEntry(BeamEntry& newEntry, BeamEntry& oldEntry, Refinement& refinement, - std::unique_ptr thresholdsSubsetPtr, + std::unique_ptr featureSubspacePtr, std::unique_ptr conditionListPtr, bool keepHead, uint32 minCoverage) { - thresholdsSubsetPtr->filterThresholds(refinement); - newEntry.thresholdsSubsetPtr = std::move(thresholdsSubsetPtr); + featureSubspacePtr->filterSubspace(refinement); + newEntry.featureSubspacePtr = std::move(featureSubspacePtr); newEntry.conditionListPtr = std::move(conditionListPtr); newEntry.conditionListPtr->addCondition(refinement); newEntry.headPtr = std::move(refinement.headPtr); @@ -68,7 +68,7 @@ static inline void copyEntry(BeamEntry& newEntry, BeamEntry& oldEntry, Refinemen } static inline void copyEntry(BeamEntry& newEntry, BeamEntry& oldEntry) { - newEntry.thresholdsSubsetPtr = std::move(oldEntry.thresholdsSubsetPtr); + newEntry.featureSubspacePtr = std::move(oldEntry.featureSubspacePtr); newEntry.conditionListPtr = std::move(oldEntry.conditionListPtr); newEntry.headPtr = std::move(oldEntry.headPtr); 
newEntry.labelIndices = nullptr; @@ -107,14 +107,14 @@ class Beam final { /** * @param refinementComparator A reference to an object of type `FixedRefinementComparator` that keeps track of * existing refinements of rules - * @param thresholdsSubsetPtr An unique pointer to an object of type `IThresholdsSubset` that has been used to + * @param featureSubspacePtr An unique pointer to an object of type `IFeatureSubspace` that has been used to * find the existing refinements of rules * @param labelIndices A reference to an object of type `IIndexVector` that provides access to the * indices of the labels for which further refinement may predict * @param keepHeads True, if further refinements should predict for the same labels as before, false * otherwise */ - Beam(FixedRefinementComparator& refinementComparator, std::unique_ptr thresholdsSubsetPtr, + Beam(FixedRefinementComparator& refinementComparator, std::unique_ptr featureSubspacePtr, const IIndexVector& labelIndices, bool keepHeads) : Beam(refinementComparator.getNumElements()) { FixedRefinementComparator::iterator iterator = refinementComparator.begin(); @@ -123,13 +123,13 @@ class Beam final { for (; i < numEntries_ - 1; i++) { Refinement& refinement = iterator[i]; BeamEntry& entry = entries_[i]; - initializeEntry(entry, refinement, thresholdsSubsetPtr->copy(), labelIndices, keepHeads); + initializeEntry(entry, refinement, featureSubspacePtr->copy(), labelIndices, keepHeads); order_.push_back(entry); } Refinement& refinement = iterator[i]; BeamEntry& entry = entries_[i]; - initializeEntry(entry, refinement, std::move(thresholdsSubsetPtr), labelIndices, keepHeads); + initializeEntry(entry, refinement, std::move(featureSubspacePtr), labelIndices, keepHeads); order_.push_back(entry); } @@ -181,7 +181,7 @@ class Beam final { // Search for refinements of the existing beam entry... 
FixedRefinementComparator refinementComparator(ruleCompareFunction, beamWidth, minQuality); - foundRefinement = findRefinement(refinementComparator, *entry.thresholdsSubsetPtr, featureIndices, + foundRefinement = findRefinement(refinementComparator, *entry.featureSubspacePtr, featureIndices, *entry.labelIndices, minCoverage, numThreads); if (foundRefinement) { @@ -191,39 +191,39 @@ class Beam final { uint32 i = 0; // Include all refinements, except for the last one, in the new beam. The corresponding - // `IThresholdsSubset` and `ConditionList` are copied... + // `IFeatureSubspace` and `ConditionList` are copied... for (; i < numRefinements - 1; i++) { Refinement& refinement = iterator[i]; if (n < beamWidth) { BeamEntry& newEntry = newEntries[n]; - copyEntry(newEntry, entry, refinement, entry.thresholdsSubsetPtr->copy(), + copyEntry(newEntry, entry, refinement, entry.featureSubspacePtr->copy(), std::make_unique(*entry.conditionListPtr), keepHeads, minCoverage); newOrder.push_back(newEntry); n++; } else { BeamEntry& newEntry = newOrder.back(); - copyEntry(newEntry, entry, refinement, entry.thresholdsSubsetPtr->copy(), + copyEntry(newEntry, entry, refinement, entry.featureSubspacePtr->copy(), std::make_unique(*entry.conditionListPtr), keepHeads, minCoverage); minQuality = updateOrder(ruleCompareFunction, newOrder); } } - // Include the last refinement in the beam. The corresponding `IThresholdsSubset` and + // Include the last refinement in the beam. The corresponding `IFeatureSubspace` and // `ConditionList` are reused... 
Refinement& refinement = iterator[i]; if (n < beamWidth) { BeamEntry& newEntry = newEntries[n]; - copyEntry(newEntry, entry, refinement, std::move(entry.thresholdsSubsetPtr), + copyEntry(newEntry, entry, refinement, std::move(entry.featureSubspacePtr), std::move(entry.conditionListPtr), keepHeads, minCoverage); newOrder.push_back(newEntry); n++; } else { BeamEntry& newEntry = newOrder.back(); - copyEntry(newEntry, entry, refinement, std::move(entry.thresholdsSubsetPtr), + copyEntry(newEntry, entry, refinement, std::move(entry.featureSubspacePtr), std::move(entry.conditionListPtr), keepHeads, minCoverage); minQuality = updateOrder(ruleCompareFunction, newOrder); } @@ -312,26 +312,26 @@ class BeamSearchTopDownRuleInduction final : public AbstractRuleInduction { protected: - std::unique_ptr growRule(IThresholds& thresholds, const IIndexVector& labelIndices, - const IWeightVector& weights, IPartition& partition, - IFeatureSampling& featureSampling, RNG& rng, - std::unique_ptr& conditionListPtr, - std::unique_ptr& headPtr) const override { + std::unique_ptr growRule(IFeatureSpace& featureSpace, const IIndexVector& labelIndices, + const IWeightVector& weights, IPartition& partition, + IFeatureSampling& featureSampling, RNG& rng, + std::unique_ptr& conditionListPtr, + std::unique_ptr& headPtr) const override { // Create a new subset of the given thresholds... - std::unique_ptr thresholdsSubsetPtr = weights.createThresholdsSubset(thresholds); + std::unique_ptr featureSubspacePtr = weights.createFeatureSubspace(featureSpace); // Sample features... const IIndexVector& sampledFeatureIndices = featureSampling.sample(rng); // Search for the best refinements using a single condition... 
FixedRefinementComparator refinementComparator(ruleCompareFunction_, beamWidth_); - bool foundRefinement = findRefinement(refinementComparator, *thresholdsSubsetPtr, sampledFeatureIndices, + bool foundRefinement = findRefinement(refinementComparator, *featureSubspacePtr, sampledFeatureIndices, labelIndices, minCoverage_, numThreads_); if (foundRefinement) { bool keepHeads = maxHeadRefinements_ == 1; std::unique_ptr beamPtr = - std::make_unique(refinementComparator, std::move(thresholdsSubsetPtr), labelIndices, keepHeads); + std::make_unique(refinementComparator, std::move(featureSubspacePtr), labelIndices, keepHeads); uint32 searchDepth = 1; while (foundRefinement && (maxConditions_ == 0 || searchDepth < maxConditions_)) { @@ -351,10 +351,10 @@ class BeamSearchTopDownRuleInduction final : public AbstractRuleInduction { BeamEntry& entry = beamPtr->getBestEntry(); conditionListPtr = std::move(entry.conditionListPtr); headPtr = std::move(entry.headPtr); - return std::move(entry.thresholdsSubsetPtr); + return std::move(entry.featureSubspacePtr); } - return thresholdsSubsetPtr; + return featureSubspacePtr; } }; diff --git a/cpp/subprojects/common/src/mlrl/common/rule_induction/rule_induction_top_down_common.hpp b/cpp/subprojects/common/src/mlrl/common/rule_induction/rule_induction_top_down_common.hpp index a45e639985..d80504fc1e 100644 --- a/cpp/subprojects/common/src/mlrl/common/rule_induction/rule_induction_top_down_common.hpp +++ b/cpp/subprojects/common/src/mlrl/common/rule_induction/rule_induction_top_down_common.hpp @@ -3,7 +3,7 @@ */ #pragma once -#include "mlrl/common/thresholds/thresholds_subset.hpp" +#include "mlrl/common/rule_refinement/feature_subspace.hpp" #include "mlrl/common/util/openmp.hpp" /** @@ -14,7 +14,7 @@ * @tparam The type of the comparator that allows comparing different refinements and keeping track of the best one(s) */ template -struct RuleRefinement final { +struct RuleRefinementEntry final { public: /** @@ -36,7 +36,7 @@ struct 
RuleRefinement final { * @tparam RefinementComparator The type of the comparator that is used to compare the potential refinements * @param refinementComparator A reference to an object of template type `RefinementComparator` that should be used to * compare the potential refinements - * @param thresholdsSubset A reference to an object of type `IThresholdsSubset` that should be used to search for + * @param featureSubspace A reference to an object of type `IFeatureSubspace` that should be used to search for * the potential refinements * @param featureIndices A reference to an object of type `IIndexVector` that provides access to the indices of * the features that should be considered @@ -48,38 +48,39 @@ struct RuleRefinement final { * @return True, if at least one refinement has been found, false otherwise */ template -static inline bool findRefinement(RefinementComparator& refinementComparator, IThresholdsSubset& thresholdsSubset, +static inline bool findRefinement(RefinementComparator& refinementComparator, IFeatureSubspace& featureSubspace, const IIndexVector& featureIndices, const IIndexVector& labelIndices, uint32 minCoverage, uint32 numThreads) { bool foundRefinement = false; // For each feature, create an object of type `RuleRefinement`... 
uint32 numFeatures = featureIndices.getNumElements(); - RuleRefinement* ruleRefinements = new RuleRefinement[numFeatures]; + RuleRefinementEntry* ruleRefinementEntries = + new RuleRefinementEntry[numFeatures]; for (uint32 i = 0; i < numFeatures; i++) { uint32 featureIndex = featureIndices.getIndex(i); - RuleRefinement& ruleRefinement = ruleRefinements[i]; - ruleRefinement.comparatorPtr = std::make_unique(refinementComparator); - ruleRefinement.ruleRefinementPtr = labelIndices.createRuleRefinement(thresholdsSubset, featureIndex); + RuleRefinementEntry& ruleRefinementEntry = ruleRefinementEntries[i]; + ruleRefinementEntry.comparatorPtr = std::make_unique(refinementComparator); + ruleRefinementEntry.ruleRefinementPtr = labelIndices.createRuleRefinement(featureSubspace, featureIndex); } // Search for the best condition among all available features to be added to the current rule... #if MULTI_THREADING_SUPPORT_ENABLED - #pragma omp parallel for firstprivate(numFeatures) firstprivate(ruleRefinements) firstprivate(minCoverage) \ + #pragma omp parallel for firstprivate(numFeatures) firstprivate(ruleRefinementEntries) firstprivate(minCoverage) \ schedule(dynamic) num_threads(numThreads) #endif for (int64 i = 0; i < numFeatures; i++) { - RuleRefinement& ruleRefinement = ruleRefinements[i]; - ruleRefinement.ruleRefinementPtr->findRefinement(*ruleRefinement.comparatorPtr, minCoverage); + RuleRefinementEntry& ruleRefinementEntry = ruleRefinementEntries[i]; + ruleRefinementEntry.ruleRefinementPtr->findRefinement(*ruleRefinementEntry.comparatorPtr, minCoverage); } // Pick the best refinement among the refinements that have been found for the different features... 
for (uint32 i = 0; i < numFeatures; i++) { - RuleRefinement& ruleRefinement = ruleRefinements[i]; - foundRefinement |= refinementComparator.merge(*ruleRefinement.comparatorPtr); + RuleRefinementEntry& ruleRefinementEntry = ruleRefinementEntries[i]; + foundRefinement |= refinementComparator.merge(*ruleRefinementEntry.comparatorPtr); } - delete[] ruleRefinements; + delete[] ruleRefinementEntries; return foundRefinement; } diff --git a/cpp/subprojects/common/src/mlrl/common/rule_induction/rule_induction_top_down_greedy.cpp b/cpp/subprojects/common/src/mlrl/common/rule_induction/rule_induction_top_down_greedy.cpp index 56593dfcff..bb5abf42f1 100644 --- a/cpp/subprojects/common/src/mlrl/common/rule_induction/rule_induction_top_down_greedy.cpp +++ b/cpp/subprojects/common/src/mlrl/common/rule_induction/rule_induction_top_down_greedy.cpp @@ -47,11 +47,11 @@ class GreedyTopDownRuleInduction final : public AbstractRuleInduction { protected: - std::unique_ptr growRule(IThresholds& thresholds, const IIndexVector& labelIndices, - const IWeightVector& weights, IPartition& partition, - IFeatureSampling& featureSampling, RNG& rng, - std::unique_ptr& conditionListPtr, - std::unique_ptr& headPtr) const override { + std::unique_ptr growRule(IFeatureSpace& featureSpace, const IIndexVector& labelIndices, + const IWeightVector& weights, IPartition& partition, + IFeatureSampling& featureSampling, RNG& rng, + std::unique_ptr& conditionListPtr, + std::unique_ptr& headPtr) const override { // The label indices for which the next refinement of the rule may predict const IIndexVector* currentLabelIndices = &labelIndices; // A list that contains the conditions in the rule's body (in the order they have been learned) @@ -62,7 +62,7 @@ class GreedyTopDownRuleInduction final : public AbstractRuleInduction { bool foundRefinement = true; // Create a new subset of the given thresholds... 
- std::unique_ptr thresholdsSubsetPtr = weights.createThresholdsSubset(thresholds); + std::unique_ptr featureSubspacePtr = weights.createFeatureSubspace(featureSpace); // Search for the best refinement until no improvement in terms of the rule's quality is possible anymore or // until the maximum number of conditions has been reached... @@ -71,7 +71,7 @@ class GreedyTopDownRuleInduction final : public AbstractRuleInduction { const IIndexVector& sampledFeatureIndices = featureSampling.sample(rng); // Search for the best refinement... - foundRefinement = findRefinement(refinementComparator, *thresholdsSubsetPtr, sampledFeatureIndices, + foundRefinement = findRefinement(refinementComparator, *featureSubspacePtr, sampledFeatureIndices, *currentLabelIndices, minCoverage_, numThreads_); if (foundRefinement) { @@ -81,7 +81,7 @@ class GreedyTopDownRuleInduction final : public AbstractRuleInduction { bestRefinement.headPtr->sort(); // Filter the current subset of thresholds by applying the best refinement that has been found... - thresholdsSubsetPtr->filterThresholds(bestRefinement); + featureSubspacePtr->filterSubspace(bestRefinement); // Add the new condition... 
conditionListPtr->addCondition(bestRefinement); @@ -100,7 +100,7 @@ class GreedyTopDownRuleInduction final : public AbstractRuleInduction { Refinement& bestRefinement = *refinementComparator.begin(); headPtr = std::move(bestRefinement.headPtr); - return thresholdsSubsetPtr; + return featureSubspacePtr; } }; diff --git a/cpp/subprojects/common/src/mlrl/common/rule_model_assemblage/rule_model_assemblage_sequential.cpp b/cpp/subprojects/common/src/mlrl/common/rule_model_assemblage/rule_model_assemblage_sequential.cpp index 19601b62a8..eea6812345 100644 --- a/cpp/subprojects/common/src/mlrl/common/rule_model_assemblage/rule_model_assemblage_sequential.cpp +++ b/cpp/subprojects/common/src/mlrl/common/rule_model_assemblage/rule_model_assemblage_sequential.cpp @@ -26,8 +26,8 @@ class SequentialRuleModelAssemblage final : public IRuleModelAssemblage { void induceRules(const IRuleInduction& ruleInduction, const IRulePruning& rulePruning, const IPostProcessor& postProcessor, IPartition& partition, ILabelSampling& labelSampling, IInstanceSampling& instanceSampling, IFeatureSampling& featureSampling, - IStatisticsProvider& statisticsProvider, IThresholds& thresholds, IModelBuilder& modelBuilder, - RNG& rng) const override { + IStatisticsProvider& statisticsProvider, IFeatureSpace& featureSpace, + IModelBuilder& modelBuilder, RNG& rng) const override { uint32 numRules = useDefaultRule_ ? 
1 : 0; uint32 numUsedRules = 0; @@ -56,7 +56,7 @@ class SequentialRuleModelAssemblage final : public IRuleModelAssemblage { const IWeightVector& weights = instanceSampling.sample(rng); const IIndexVector& labelIndices = labelSampling.sample(rng); - bool success = ruleInduction.induceRule(thresholds, labelIndices, weights, partition, featureSampling, + bool success = ruleInduction.induceRule(featureSpace, labelIndices, weights, partition, featureSampling, rulePruning, postProcessor, rng, modelBuilder); if (success) { diff --git a/cpp/subprojects/common/src/mlrl/common/rule_pruning/rule_pruning_irep.cpp b/cpp/subprojects/common/src/mlrl/common/rule_pruning/rule_pruning_irep.cpp index 902fe52586..f2bff6cb71 100644 --- a/cpp/subprojects/common/src/mlrl/common/rule_pruning/rule_pruning_irep.cpp +++ b/cpp/subprojects/common/src/mlrl/common/rule_pruning/rule_pruning_irep.cpp @@ -17,22 +17,22 @@ class Irep final : public IRulePruning { */ Irep(RuleCompareFunction ruleCompareFunction) : ruleCompareFunction_(ruleCompareFunction) {} - std::unique_ptr prune(IThresholdsSubset& thresholdsSubset, IPartition& partition, - ConditionList& conditions, const IPrediction& head) const override { + std::unique_ptr prune(IFeatureSubspace& featureSubspace, IPartition& partition, + ConditionList& conditions, const IPrediction& head) const override { uint32 numConditions = conditions.getNumConditions(); - std::unique_ptr bestCoverageStatePtr; + std::unique_ptr bestCoverageMaskPtr; // Only rules with more than one condition can be pruned... if (numConditions > 1) { // Calculate the quality of the original rule on the prune set... 
- const ICoverageState& originalCoverageState = thresholdsSubset.getCoverageState(); - Quality bestQuality = partition.evaluateOutOfSample(thresholdsSubset, originalCoverageState, head); + const CoverageMask& originalCoverageMask = featureSubspace.getCoverageMask(); + Quality bestQuality = partition.evaluateOutOfSample(featureSubspace, originalCoverageMask, head); // Create a copy of the original coverage mask... - bestCoverageStatePtr = originalCoverageState.copy(); + bestCoverageMaskPtr = std::make_unique(originalCoverageMask); // Reset the given thresholds... - thresholdsSubset.resetThresholds(); + featureSubspace.resetSubspace(); // We process the existing rule's conditions (except for the last one) in the order they have been // learned. At each iteration, we calculate the quality of a rule that only contains the conditions @@ -43,18 +43,18 @@ class Irep final : public IRulePruning { for (uint32 n = 1; n < numConditions; n++) { // Filter the thresholds by applying the current condition... const Condition& condition = *conditionIterator; - thresholdsSubset.filterThresholds(condition); + featureSubspace.filterSubspace(condition); // Calculate the quality of a rule that contains the conditions that have been processed so far... - const ICoverageState& coverageState = thresholdsSubset.getCoverageState(); - Quality quality = partition.evaluateOutOfSample(thresholdsSubset, coverageState, head); + const CoverageMask& coverageMask = featureSubspace.getCoverageMask(); + Quality quality = partition.evaluateOutOfSample(featureSubspace, coverageMask, head); // Check if the quality is better than the best quality seen so far (reaching the same quality with // fewer conditions is considered an improvement)... 
if (ruleCompareFunction_.compare(quality, bestQuality) || (numPrunedConditions == 0 && !ruleCompareFunction_.compare(bestQuality, quality))) { bestQuality = quality; - bestCoverageStatePtr = coverageState.copy(); + bestCoverageMaskPtr = std::make_unique(coverageMask); numPrunedConditions = (numConditions - n); } @@ -68,7 +68,7 @@ class Irep final : public IRulePruning { } } - return bestCoverageStatePtr; + return bestCoverageMaskPtr; } }; diff --git a/cpp/subprojects/common/src/mlrl/common/rule_pruning/rule_pruning_no.cpp b/cpp/subprojects/common/src/mlrl/common/rule_pruning/rule_pruning_no.cpp index 83df8c56d4..3f895bf532 100644 --- a/cpp/subprojects/common/src/mlrl/common/rule_pruning/rule_pruning_no.cpp +++ b/cpp/subprojects/common/src/mlrl/common/rule_pruning/rule_pruning_no.cpp @@ -6,8 +6,8 @@ class NoRulePruning final : public IRulePruning { public: - std::unique_ptr prune(IThresholdsSubset& thresholdsSubset, IPartition& partition, - ConditionList& conditions, const IPrediction& head) const override { + std::unique_ptr prune(IFeatureSubspace& featureSubspace, IPartition& partition, + ConditionList& conditions, const IPrediction& head) const override { return nullptr; } }; diff --git a/cpp/subprojects/common/src/mlrl/common/rule_refinement/coverage_mask.cpp b/cpp/subprojects/common/src/mlrl/common/rule_refinement/coverage_mask.cpp new file mode 100644 index 0000000000..8f6d20b15a --- /dev/null +++ b/cpp/subprojects/common/src/mlrl/common/rule_refinement/coverage_mask.cpp @@ -0,0 +1,22 @@ +#include "mlrl/common/rule_refinement/coverage_mask.hpp" + +#include "mlrl/common/rule_refinement/feature_subspace.hpp" +#include "mlrl/common/rule_refinement/prediction.hpp" + +CoverageMask::CoverageMask(uint32 numElements) + : DenseVectorDecorator>(AllocatedVector(numElements, true)), indicatorValue(0) {} + +CoverageMask::CoverageMask(const CoverageMask& other) + : DenseVectorDecorator>(AllocatedVector(other.getNumElements())), + indicatorValue(other.indicatorValue) { + 
copyView(other.cbegin(), this->begin(), this->getNumElements()); +} + +void CoverageMask::reset() { + indicatorValue = 0; + setViewToZeros(this->begin(), this->getNumElements()); +} + +bool CoverageMask::operator[](uint32 index) const { + return this->view.array[index] == indicatorValue; +} diff --git a/cpp/subprojects/common/src/mlrl/common/rule_refinement/feature_based_search.cpp b/cpp/subprojects/common/src/mlrl/common/rule_refinement/feature_based_search.cpp new file mode 100644 index 0000000000..273633cd52 --- /dev/null +++ b/cpp/subprojects/common/src/mlrl/common/rule_refinement/feature_based_search.cpp @@ -0,0 +1,125 @@ +#include "mlrl/common/rule_refinement/feature_based_search.hpp" + +#include "feature_based_search_binary.hpp" +#include "feature_based_search_binned.hpp" +#include "feature_based_search_nominal.hpp" +#include "feature_based_search_numerical.hpp" +#include "feature_based_search_ordinal.hpp" + +static inline void addMissingStatistics(IWeightedStatisticsSubset& statisticsSubset, + const MissingFeatureVector& missingFeatureVector) { + for (auto it = missingFeatureVector.indices_cbegin(); it != missingFeatureVector.indices_cend(); it++) { + uint32 index = *it; + statisticsSubset.addToMissing(index); + } +} + +void FeatureBasedSearch::searchForNumericalRefinement(const NumericalFeatureVector& featureVector, + const MissingFeatureVector& missingFeatureVector, + IWeightedStatisticsSubset& statisticsSubset, + SingleRefinementComparator& comparator, + uint32 numExamplesWithNonZeroWeights, uint32 minCoverage, + Refinement& refinement) const { + addMissingStatistics(statisticsSubset, missingFeatureVector); + searchForNumericalRefinementInternally(featureVector, statisticsSubset, comparator, numExamplesWithNonZeroWeights, + minCoverage, refinement); +} + +void FeatureBasedSearch::searchForNumericalRefinement(const NumericalFeatureVector& featureVector, + const MissingFeatureVector& missingFeatureVector, + IWeightedStatisticsSubset& statisticsSubset, + 
FixedRefinementComparator& comparator, + uint32 numExamplesWithNonZeroWeights, uint32 minCoverage, + Refinement& refinement) const { + addMissingStatistics(statisticsSubset, missingFeatureVector); + searchForNumericalRefinementInternally(featureVector, statisticsSubset, comparator, numExamplesWithNonZeroWeights, + minCoverage, refinement); +} + +void FeatureBasedSearch::searchForNominalRefinement(const NominalFeatureVector& featureVector, + const MissingFeatureVector& missingFeatureVector, + IWeightedStatisticsSubset& statisticsSubset, + SingleRefinementComparator& comparator, + uint32 numExamplesWithNonZeroWeights, uint32 minCoverage, + Refinement& refinement) const { + addMissingStatistics(statisticsSubset, missingFeatureVector); + searchForNominalRefinementInternally(featureVector, statisticsSubset, comparator, numExamplesWithNonZeroWeights, + minCoverage, refinement); +} + +void FeatureBasedSearch::searchForNominalRefinement(const NominalFeatureVector& featureVector, + const MissingFeatureVector& missingFeatureVector, + IWeightedStatisticsSubset& statisticsSubset, + FixedRefinementComparator& comparator, + uint32 numExamplesWithNonZeroWeights, uint32 minCoverage, + Refinement& refinement) const { + addMissingStatistics(statisticsSubset, missingFeatureVector); + searchForNominalRefinementInternally(featureVector, statisticsSubset, comparator, numExamplesWithNonZeroWeights, + minCoverage, refinement); +} + +void FeatureBasedSearch::searchForBinaryRefinement(const BinaryFeatureVector& featureVector, + const MissingFeatureVector& missingFeatureVector, + IWeightedStatisticsSubset& statisticsSubset, + SingleRefinementComparator& comparator, + uint32 numExamplesWithNonZeroWeights, uint32 minCoverage, + Refinement& refinement) const { + addMissingStatistics(statisticsSubset, missingFeatureVector); + searchForBinaryRefinementInternally(featureVector, statisticsSubset, comparator, numExamplesWithNonZeroWeights, + minCoverage, refinement); +} + +void 
FeatureBasedSearch::searchForBinaryRefinement(const BinaryFeatureVector& featureVector, + const MissingFeatureVector& missingFeatureVector, + IWeightedStatisticsSubset& statisticsSubset, + FixedRefinementComparator& comparator, + uint32 numExamplesWithNonZeroWeights, uint32 minCoverage, + Refinement& refinement) const { + addMissingStatistics(statisticsSubset, missingFeatureVector); + searchForBinaryRefinementInternally(featureVector, statisticsSubset, comparator, numExamplesWithNonZeroWeights, + minCoverage, refinement); +} + +void FeatureBasedSearch::searchForOrdinalRefinement(const OrdinalFeatureVector& featureVector, + const MissingFeatureVector& missingFeatureVector, + IWeightedStatisticsSubset& statisticsSubset, + SingleRefinementComparator& comparator, + uint32 numExamplesWithNonZeroWeights, uint32 minCoverage, + Refinement& refinement) const { + addMissingStatistics(statisticsSubset, missingFeatureVector); + searchForOrdinalRefinementInternally(featureVector, statisticsSubset, comparator, numExamplesWithNonZeroWeights, + minCoverage, refinement); +} + +void FeatureBasedSearch::searchForOrdinalRefinement(const OrdinalFeatureVector& featureVector, + const MissingFeatureVector& missingFeatureVector, + IWeightedStatisticsSubset& statisticsSubset, + FixedRefinementComparator& comparator, + uint32 numExamplesWithNonZeroWeights, uint32 minCoverage, + Refinement& refinement) const { + addMissingStatistics(statisticsSubset, missingFeatureVector); + searchForOrdinalRefinementInternally(featureVector, statisticsSubset, comparator, numExamplesWithNonZeroWeights, + minCoverage, refinement); +} + +void FeatureBasedSearch::searchForBinnedRefinement(const BinnedFeatureVector& featureVector, + const MissingFeatureVector& missingFeatureVector, + IWeightedStatisticsSubset& statisticsSubset, + SingleRefinementComparator& comparator, + uint32 numExamplesWithNonZeroWeights, uint32 minCoverage, + Refinement& refinement) const { + addMissingStatistics(statisticsSubset, 
missingFeatureVector); + searchForBinnedRefinementInternally(featureVector, statisticsSubset, comparator, numExamplesWithNonZeroWeights, + minCoverage, refinement); +} + +void FeatureBasedSearch::searchForBinnedRefinement(const BinnedFeatureVector& featureVector, + const MissingFeatureVector& missingFeatureVector, + IWeightedStatisticsSubset& statisticsSubset, + FixedRefinementComparator& comparator, + uint32 numExamplesWithNonZeroWeights, uint32 minCoverage, + Refinement& refinement) const { + addMissingStatistics(statisticsSubset, missingFeatureVector); + searchForBinnedRefinementInternally(featureVector, statisticsSubset, comparator, numExamplesWithNonZeroWeights, + minCoverage, refinement); +} diff --git a/cpp/subprojects/common/src/mlrl/common/rule_refinement/rule_refinement_search_binary.hpp b/cpp/subprojects/common/src/mlrl/common/rule_refinement/feature_based_search_binary.hpp similarity index 97% rename from cpp/subprojects/common/src/mlrl/common/rule_refinement/rule_refinement_search_binary.hpp rename to cpp/subprojects/common/src/mlrl/common/rule_refinement/feature_based_search_binary.hpp index 922a4f9540..fd3ddeccd2 100644 --- a/cpp/subprojects/common/src/mlrl/common/rule_refinement/rule_refinement_search_binary.hpp +++ b/cpp/subprojects/common/src/mlrl/common/rule_refinement/feature_based_search_binary.hpp @@ -3,9 +3,9 @@ */ #pragma once +#include "feature_based_search_binned_common.hpp" #include "mlrl/common/input/feature_vector_binary.hpp" #include "mlrl/common/rule_refinement/refinement.hpp" -#include "rule_refinement_search_nominal_common.hpp" template static inline void searchForBinaryRefinementInternally(const BinaryFeatureVector& featureVector, diff --git a/cpp/subprojects/common/src/mlrl/common/rule_refinement/feature_based_search_binned.hpp b/cpp/subprojects/common/src/mlrl/common/rule_refinement/feature_based_search_binned.hpp new file mode 100644 index 0000000000..a39fcf74db --- /dev/null +++ 
b/cpp/subprojects/common/src/mlrl/common/rule_refinement/feature_based_search_binned.hpp
@@ -0,0 +1,227 @@
+/*
+ * @author Michael Rapp (michael.rapp.ml@gmail.com)
+ */
+#pragma once
+
+#include "feature_based_search_binned_common.hpp"
+#include "mlrl/common/input/feature_vector_binned.hpp"
+#include "mlrl/common/rule_refinement/refinement.hpp"
+#include "mlrl/common/statistics/statistics_subset_weighted.hpp"
+
+/**
+ * Searches for the best refinements of a rule that result from the thresholds that separate the bins of a given
+ * `BinnedFeatureVector`. Conditions using the <= and the > operator are evaluated for these thresholds, provided
+ * that they cover at least `minCoverage` examples. Candidates that are considered an improvement by the given
+ * comparator are passed to it via `pushRefinement`.
+ *
+ * @tparam Comparator                   The type of the comparator that is used to compare candidate refinements
+ * @param featureVector                 A reference to an object of type `BinnedFeatureVector` that provides access to
+ *                                      the thresholds, the number of bins and the index of the sparse bin
+ * @param statisticsSubset              A reference to an object of type `IWeightedStatisticsSubset` the examples in
+ *                                      individual bins are added to and that is used to calculate scores
+ * @param comparator                    A reference to an object of template type `Comparator` that decides whether a
+ *                                      candidate is an improvement and that receives such candidates
+ * @param numExamplesWithNonZeroWeights The total number of examples with non-zero weights. It is used to derive the
+ *                                      number of examples that are not covered by a condition
+ * @param minCoverage                   The minimum number of examples that must be covered by a condition
+ * @param refinement                    A reference to an object of type `Refinement` that is overwritten with the
+ *                                      properties of each candidate before it is passed to the comparator
+ */
+template<typename Comparator>
+static inline void searchForBinnedRefinementInternally(const BinnedFeatureVector& featureVector,
+                                                       IWeightedStatisticsSubset& statisticsSubset,
+                                                       Comparator& comparator, uint32 numExamplesWithNonZeroWeights,
+                                                       uint32 minCoverage, Refinement& refinement) {
+    // Mark all examples corresponding to the first bin with index `i < sparseBinIndex` as covered...
+    BinnedFeatureVector::threshold_const_iterator thresholdIterator = featureVector.thresholds_cbegin();
+    uint32 numBins = featureVector.numBins;
+    int32 sparseBinIndex = featureVector.sparseBinIndex;
+    uint32 numCovered = 0;
+    int64 i = 0;
+
+    if (i < sparseBinIndex) {
+        numCovered += addAllToSubset(statisticsSubset, featureVector, i);
+    }
+
+    // Traverse bins with indices `i < sparseBinIndex` in ascending order...
+    if (numCovered > 0) {
+        for (i = i + 1; i < sparseBinIndex; i++) {
+            // Check if a condition using the <= operator covers at least `minCoverage` examples...
+            if (numCovered >= minCoverage) {
+                // Determine the best prediction for the examples covered by a condition using the <= operator...
+                const IScoreVector& scoreVector = statisticsSubset.calculateScores();
+
+                // Check if the quality of the prediction is better than the quality of the current rule...
+                if (comparator.isImprovement(scoreVector)) {
+                    refinement.start = 0;
+                    refinement.end = i;
+                    refinement.inverse = false;
+                    refinement.numCovered = numCovered;
+                    refinement.comparator = NUMERICAL_LEQ;
+                    refinement.threshold = thresholdIterator[i - 1];
+                    comparator.pushRefinement(refinement, scoreVector);
+                }
+            }
+
+            // Check if a condition using the > operator covers at least `minCoverage` examples...
+            uint32 numUncovered = numExamplesWithNonZeroWeights - numCovered;
+
+            if (numUncovered >= minCoverage) {
+                // Determine the best prediction for examples covered by a condition using the > operator...
+                const IScoreVector& scoreVector = statisticsSubset.calculateScoresUncovered();
+
+                // Check if the quality of the prediction is better than the quality of the current rule...
+                if (comparator.isImprovement(scoreVector)) {
+                    refinement.start = 0;
+                    refinement.end = i;
+                    refinement.inverse = true;
+                    refinement.numCovered = numUncovered;
+                    refinement.comparator = NUMERICAL_GR;
+                    refinement.threshold = thresholdIterator[i - 1];
+                    comparator.pushRefinement(refinement, scoreVector);
+                }
+            }
+
+            // Mark all examples corresponding to the current bin as covered...
+            numCovered += addAllToSubset(statisticsSubset, featureVector, i);
+        }
+
+        // Reset the subset, if any bins with indices `i < sparseBinIndex` have been processed...
+        statisticsSubset.resetSubset();
+    }
+
+    // Mark all examples corresponding to the last bin with index `i > sparseBinIndex` as covered...
+    uint32 numCoveredLessThanSparseBinIndex = numCovered;
+    numCovered = 0;
+    i = numBins - 1;
+
+    if (i > sparseBinIndex) {
+        numCovered += addAllToSubset(statisticsSubset, featureVector, i);
+    }
+
+    // Traverse bins with indices `i > sparseBinIndex` in descending order...
+    if (numCovered > 0) {
+        for (i = i - 1; i > sparseBinIndex; i--) {
+            // Check if a condition using the > operator covers at least `minCoverage` examples...
+            if (numCovered >= minCoverage) {
+                // Determine the best prediction for the covered examples...
+                const IScoreVector& scoreVector = statisticsSubset.calculateScores();
+
+                // Check if the quality of the prediction is better than the quality of the current rule...
+                if (comparator.isImprovement(scoreVector)) {
+                    refinement.start = i + 1;
+                    refinement.end = numBins;
+                    refinement.inverse = false;
+                    refinement.numCovered = numCovered;
+                    refinement.comparator = NUMERICAL_GR;
+                    refinement.threshold = thresholdIterator[i];
+                    comparator.pushRefinement(refinement, scoreVector);
+                }
+            }
+
+            // Check if a condition using the <= operator covers at least `minCoverage` examples...
+            uint32 numUncovered = numExamplesWithNonZeroWeights - numCovered;
+
+            if (numUncovered >= minCoverage) {
+                // Determine the best prediction for the covered examples...
+                const IScoreVector& scoreVector = statisticsSubset.calculateScoresUncovered();
+
+                // Check if the quality of the prediction is better than the quality of the current rule...
+                if (comparator.isImprovement(scoreVector)) {
+                    refinement.start = i + 1;
+                    refinement.end = numBins;
+                    refinement.inverse = true;
+                    refinement.numCovered = numUncovered;
+                    refinement.comparator = NUMERICAL_LEQ;
+                    refinement.threshold = thresholdIterator[i];
+                    comparator.pushRefinement(refinement, scoreVector);
+                }
+            }
+
+            // Mark all examples corresponding to the current bin as covered...
+            numCovered += addAllToSubset(statisticsSubset, featureVector, i);
+        }
+    }
+
+    // Check if a condition that covers all bins with indices `i > sparseBinIndex` covers at least `minCoverage`
+    // examples...
+    if (numCovered >= minCoverage) {
+        // Determine the best prediction for examples covered by the condition...
+        const IScoreVector& scoreVector = statisticsSubset.calculateScores();
+
+        // Check if the quality of the prediction is better than the quality of the current rule...
+        if (comparator.isImprovement(scoreVector)) {
+            refinement.start = sparseBinIndex + 1;
+            refinement.end = numBins;
+            refinement.numCovered = numCovered;
+            refinement.inverse = false;
+            refinement.comparator = NUMERICAL_GR;
+            refinement.threshold = thresholdIterator[sparseBinIndex];
+            comparator.pushRefinement(refinement, scoreVector);
+        }
+    }
+
+    // Check if a condition that covers all bins with indices `i <= sparseBinIndex` covers at least `minCoverage`
+    // examples...
+    // NOTE(review): `thresholdIterator[sparseBinIndex]` is also read if there are no bins with indices
+    // `i > sparseBinIndex`; it should be confirmed that a threshold exists at that position in this case...
+    uint32 numUncovered = numExamplesWithNonZeroWeights - numCovered;
+
+    if (numUncovered >= minCoverage) {
+        // Determine the best prediction for the examples covered by the condition, i.e., the ones not in the subset...
+        const IScoreVector& scoreVector = statisticsSubset.calculateScoresUncovered();
+
+        // Check if the quality of the prediction is better than the quality of the current rule...
+        if (comparator.isImprovement(scoreVector)) {
+            refinement.start = sparseBinIndex + 1;
+            refinement.end = numBins;
+            refinement.numCovered = numUncovered;
+            refinement.inverse = true;
+            refinement.comparator = NUMERICAL_LEQ;
+            refinement.threshold = thresholdIterator[sparseBinIndex];
+            comparator.pushRefinement(refinement, scoreVector);
+        }
+    }
+
+    // If there have been bins with indices `i < sparseBinIndex`, we must evaluate conditions that separate the examples
+    // corresponding to these bins from the remaining ones...
+    if (numCoveredLessThanSparseBinIndex > 0 && numCoveredLessThanSparseBinIndex < numExamplesWithNonZeroWeights) {
+        // Check if a condition that covers all bins with indices `i < sparseBinIndex` covers at least `minCoverage`
+        // examples...
+        if (numCoveredLessThanSparseBinIndex >= minCoverage) {
+            // Determine the best prediction for the examples covered by the condition...
+            const IScoreVector& scoreVector = statisticsSubset.calculateScoresAccumulated();
+
+            // Check if the quality of the prediction is better than the quality of the current rule...
+            if (comparator.isImprovement(scoreVector)) {
+                refinement.start = 0;
+                refinement.end = sparseBinIndex;
+                refinement.numCovered = numCoveredLessThanSparseBinIndex;
+                refinement.inverse = false;
+                refinement.comparator = NUMERICAL_LEQ;
+                refinement.threshold = thresholdIterator[sparseBinIndex - 1];
+                comparator.pushRefinement(refinement, scoreVector);
+            }
+        }
+
+        // Check if a condition that covers all bins with indices `i >= sparseBinIndex` covers at least `minCoverage`
+        // examples...
+        numUncovered = numExamplesWithNonZeroWeights - numCoveredLessThanSparseBinIndex;
+
+        if (numUncovered >= minCoverage) {
+            // Determine the best prediction for the examples covered by the condition...
+            const IScoreVector& scoreVector = statisticsSubset.calculateScoresUncoveredAccumulated();
+
+            // Check if the quality of the prediction is better than the quality of the current rule...
+            if (comparator.isImprovement(scoreVector)) {
+                refinement.start = 0;
+                refinement.end = sparseBinIndex;
+                refinement.numCovered = numUncovered;
+                refinement.inverse = true;
+                refinement.comparator = NUMERICAL_GR;
+                refinement.threshold = thresholdIterator[sparseBinIndex - 1];
+                comparator.pushRefinement(refinement, scoreVector);
+            }
+        }
+    }
+}
diff --git a/cpp/subprojects/common/src/mlrl/common/rule_refinement/rule_refinement_search_nominal_common.hpp b/cpp/subprojects/common/src/mlrl/common/rule_refinement/feature_based_search_binned_common.hpp
similarity index 50%
rename from cpp/subprojects/common/src/mlrl/common/rule_refinement/rule_refinement_search_nominal_common.hpp
rename to cpp/subprojects/common/src/mlrl/common/rule_refinement/feature_based_search_binned_common.hpp
index 3d000840f6..962c5b7df1 100644
--- a/cpp/subprojects/common/src/mlrl/common/rule_refinement/rule_refinement_search_nominal_common.hpp
+++ b/cpp/subprojects/common/src/mlrl/common/rule_refinement/feature_based_search_binned_common.hpp
@@ -3,23 +3,24 @@
  */
 #pragma once
 
-#include
"mlrl/common/input/feature_vector_nominal.hpp" #include "mlrl/common/statistics/statistics_subset_weighted.hpp" /** - * Adds all examples corresonding to a nominal feature value to a given `IWeightedStatisticsSubset`, if they have - * non-zero weights. + * Adds all examples corresponding to a single bin in a given feature vector to a given `IWeightedStatisticsSubset`, if + * they have non-zero weights. * + * @tparam FeatureVector The type of the feature vector * @param statisticsSubset A reference to an object of type `IWeightedStatisticsSubset` - * @param featureVector A reference to an object of type `NominalFeatureVector`´that stores the indices of the - * examples that corresond to individual feature values - * @param index The index of the nominal feature value + * @param featureVector A reference to an object of template type `FeatureVector` that stores the indices of the + * examples that correspond to individual bins + * @param index The index of the bin * @return The number of examples with non-zero weights */ -static inline uint32 addAllToSubset(IWeightedStatisticsSubset& statisticsSubset, - const NominalFeatureVector& featureVector, uint32 index) { - NominalFeatureVector::index_const_iterator indexIterator = featureVector.indices_cbegin(index); - NominalFeatureVector::index_const_iterator indicesEnd = featureVector.indices_cend(index); +template +static inline uint32 addAllToSubset(IWeightedStatisticsSubset& statisticsSubset, const FeatureVector& featureVector, + uint32 index) { + typename FeatureVector::index_const_iterator indexIterator = featureVector.indices_cbegin(index); + typename FeatureVector::index_const_iterator indicesEnd = featureVector.indices_cend(index); uint32 numIndices = indicesEnd - indexIterator; uint32 numCovered = 0; diff --git a/cpp/subprojects/common/src/mlrl/common/rule_refinement/rule_refinement_search_nominal.hpp b/cpp/subprojects/common/src/mlrl/common/rule_refinement/feature_based_search_nominal.hpp similarity index 97%
rename from cpp/subprojects/common/src/mlrl/common/rule_refinement/rule_refinement_search_nominal.hpp rename to cpp/subprojects/common/src/mlrl/common/rule_refinement/feature_based_search_nominal.hpp index 1d690741d7..672725e77e 100644 --- a/cpp/subprojects/common/src/mlrl/common/rule_refinement/rule_refinement_search_nominal.hpp +++ b/cpp/subprojects/common/src/mlrl/common/rule_refinement/feature_based_search_nominal.hpp @@ -3,8 +3,9 @@ */ #pragma once +#include "feature_based_search_binned_common.hpp" +#include "mlrl/common/input/feature_vector_nominal.hpp" #include "mlrl/common/rule_refinement/refinement.hpp" -#include "rule_refinement_search_nominal_common.hpp" template static inline void searchForNominalRefinementInternally(const NominalFeatureVector& featureVector, diff --git a/cpp/subprojects/common/src/mlrl/common/rule_refinement/rule_refinement_search_numerical.hpp b/cpp/subprojects/common/src/mlrl/common/rule_refinement/feature_based_search_numerical.hpp similarity index 100% rename from cpp/subprojects/common/src/mlrl/common/rule_refinement/rule_refinement_search_numerical.hpp rename to cpp/subprojects/common/src/mlrl/common/rule_refinement/feature_based_search_numerical.hpp diff --git a/cpp/subprojects/common/src/mlrl/common/rule_refinement/rule_refinement_search_ordinal.hpp b/cpp/subprojects/common/src/mlrl/common/rule_refinement/feature_based_search_ordinal.hpp similarity index 99% rename from cpp/subprojects/common/src/mlrl/common/rule_refinement/rule_refinement_search_ordinal.hpp rename to cpp/subprojects/common/src/mlrl/common/rule_refinement/feature_based_search_ordinal.hpp index 12483268f6..c1ccc3f025 100644 --- a/cpp/subprojects/common/src/mlrl/common/rule_refinement/rule_refinement_search_ordinal.hpp +++ b/cpp/subprojects/common/src/mlrl/common/rule_refinement/feature_based_search_ordinal.hpp @@ -3,9 +3,9 @@ */ #pragma once +#include "feature_based_search_binned_common.hpp" #include "mlrl/common/input/feature_vector_ordinal.hpp" #include 
"mlrl/common/rule_refinement/refinement.hpp" -#include "rule_refinement_search_nominal_common.hpp" template static inline void searchForOrdinalRefinementInternally(const OrdinalFeatureVector& featureVector, diff --git a/cpp/subprojects/common/src/mlrl/common/thresholds/thresholds_exact.cpp b/cpp/subprojects/common/src/mlrl/common/rule_refinement/feature_space_tabular.cpp similarity index 57% rename from cpp/subprojects/common/src/mlrl/common/thresholds/thresholds_exact.cpp rename to cpp/subprojects/common/src/mlrl/common/rule_refinement/feature_space_tabular.cpp index b5fac8a6fc..408e006861 100644 --- a/cpp/subprojects/common/src/mlrl/common/thresholds/thresholds_exact.cpp +++ b/cpp/subprojects/common/src/mlrl/common/rule_refinement/feature_space_tabular.cpp @@ -1,11 +1,48 @@ -#include "mlrl/common/thresholds/thresholds_exact.hpp" +#include "mlrl/common/rule_refinement/feature_space_tabular.hpp" -#include "mlrl/common/rule_refinement/rule_refinement_exact.hpp" +#include "mlrl/common/rule_refinement/rule_refinement_feature_based.hpp" #include "mlrl/common/util/openmp.hpp" -#include "thresholds_common.hpp" #include +template +static inline Quality evaluateOutOfSampleInternally(IndexIterator indexIterator, uint32 numExamples, + const WeightVector& weights, const CoverageMask& coverageMask, + const IStatistics& statistics, const IPrediction& prediction) { + OutOfSampleWeightVector outOfSampleWeights(weights); + std::unique_ptr statisticsSubsetPtr = + prediction.createStatisticsSubset(statistics, outOfSampleWeights); + + for (uint32 i = 0; i < numExamples; i++) { + uint32 exampleIndex = indexIterator[i]; + + if (statisticsSubsetPtr->hasNonZeroWeight(exampleIndex) && coverageMask[exampleIndex]) { + statisticsSubsetPtr->addToSubset(exampleIndex); + } + } + + return statisticsSubsetPtr->calculateScores(); +} + +template +static inline void recalculatePredictionInternally(IndexIterator indexIterator, uint32 numExamples, + const CoverageMask& coverageMask, const IStatistics& 
statistics, + IPrediction& prediction) { + EqualWeightVector weights(numExamples); + std::unique_ptr statisticsSubsetPtr = prediction.createStatisticsSubset(statistics, weights); + + for (uint32 i = 0; i < numExamples; i++) { + uint32 exampleIndex = indexIterator[i]; + + if (coverageMask[exampleIndex]) { + statisticsSubsetPtr->addToSubset(exampleIndex); + } + } + + const IScoreVector& scoreVector = statisticsSubsetPtr->calculateScores(); + scoreVector.updatePrediction(prediction); +} + /** * An entry that is stored in a cache and contains an unique pointer to a feature vector. The field `numConditions` * specifies how many conditions the rule contained when the vector was updated for the last time. It may be used to @@ -28,29 +65,29 @@ struct FilteredCacheEntry final { }; /** - * Provides access to all thresholds that result from the feature values of the training examples. + * Provides access to a tabular feature space. */ -class ExactThresholds final : public AbstractThresholds { +class TabularFeatureSpace final : public IFeatureSpace { private: /** - * Provides access to a subset of the thresholds that are stored by an instance of the class `ExactThresholds`. + * Provides access to a subset of a `TabularFeatureSpace`. * * @tparam WeightVector The type of the vector that provides access to the weights of individual training * examples */ template - class ThresholdsSubset final : public IThresholdsSubset { + class FeatureSubspace final : public IFeatureSubspace { private: /** * A callback that allows to retrieve feature vectors. If available, the feature vectors are retrieved * from the cache. Otherwise, they are fetched from the feature matrix. 
*/ - class Callback final : public IRuleRefinementCallback { + class Callback final : public IRuleRefinement::ICallback { private: - ThresholdsSubset& thresholdsSubset_; + FeatureSubspace& featureSubspace_; const IFeatureInfo& featureInfo_; @@ -59,53 +96,52 @@ class ExactThresholds final : public AbstractThresholds { public: /** - * @param thresholdsSubset A reference to an object of type `ThresholdsSubset` that caches the + * @param featureSubspace A reference to an object of type `FeatureSubspace` that caches the * feature vectors * @param featureInfo A reference to an object of type `IFeatureInfo` that provides * information about the types of individual features * @param featureIndex The index of the feature for which the feature vector should be * retrieved */ - Callback(ThresholdsSubset& thresholdsSubset, const IFeatureInfo& featureInfo, - uint32 featureIndex) - : thresholdsSubset_(thresholdsSubset), featureInfo_(featureInfo), + Callback(FeatureSubspace& featureSubspace, const IFeatureInfo& featureInfo, uint32 featureIndex) + : featureSubspace_(featureSubspace), featureInfo_(featureInfo), featureIndex_(featureIndex) {} Result get() override { - auto cacheFilteredIterator = thresholdsSubset_.cacheFiltered_.find(featureIndex_); + auto cacheFilteredIterator = featureSubspace_.cacheFiltered_.find(featureIndex_); FilteredCacheEntry& cacheEntry = cacheFilteredIterator->second; IFeatureVector* featureVector = cacheEntry.vectorPtr.get(); if (!featureVector) { - auto cacheIterator = thresholdsSubset_.thresholds_.cache_.find(featureIndex_); + auto cacheIterator = featureSubspace_.featureSpace_.cache_.find(featureIndex_); featureVector = cacheIterator->second.get(); if (!featureVector) { - std::unique_ptr featureTypePtr = - featureInfo_.createFeatureType(featureIndex_); + std::unique_ptr featureTypePtr = featureInfo_.createFeatureType( + featureIndex_, featureSubspace_.featureSpace_.featureBinningFactory_); cacheIterator->second = - 
thresholdsSubset_.thresholds_.featureMatrix_.createFeatureVector(featureIndex_, - *featureTypePtr); + featureSubspace_.featureSpace_.featureMatrix_.createFeatureVector( + featureIndex_, *featureTypePtr); featureVector = cacheIterator->second.get(); } } // Filter feature vector, if only a subset of its elements are covered by the current // rule... - uint32 numConditions = thresholdsSubset_.numModifications_; + uint32 numConditions = featureSubspace_.numModifications_; if (numConditions > cacheEntry.numConditions) { cacheEntry.vectorPtr = featureVector->createFilteredFeatureVector( - cacheEntry.vectorPtr, thresholdsSubset_.coverageMask_); + cacheEntry.vectorPtr, featureSubspace_.coverageMask_); cacheEntry.numConditions = numConditions; featureVector = cacheEntry.vectorPtr.get(); } - return Result(*thresholdsSubset_.weightedStatisticsPtr_, *featureVector); + return Result(*featureSubspace_.weightedStatisticsPtr_, *featureVector); } }; - ExactThresholds& thresholds_; + TabularFeatureSpace& featureSpace_; std::unique_ptr weightedStatisticsPtr_; @@ -120,8 +156,8 @@ class ExactThresholds final : public AbstractThresholds { std::unordered_map cacheFiltered_; template - std::unique_ptr createExactRuleRefinement(const IndexVector& labelIndices, - uint32 featureIndex) { + std::unique_ptr createRuleRefinementInternally(const IndexVector& labelIndices, + uint32 featureIndex) { // Retrieve the `FilteredCacheEntry` from the cache, or insert a new one if it does not already // exist... auto cacheFilteredIterator = cacheFiltered_.emplace(featureIndex, FilteredCacheEntry()).first; @@ -130,57 +166,54 @@ class ExactThresholds final : public AbstractThresholds { // If the `FilteredCacheEntry` in the cache does not refer to an `IFeatureVector`, add an empty // `unique_ptr` to the cache... 
if (!featureVector) { - thresholds_.cache_.emplace(featureIndex, std::unique_ptr()); + featureSpace_.cache_.emplace(featureIndex, std::unique_ptr()); } std::unique_ptr callbackPtr = - std::make_unique(*this, thresholds_.featureInfo_, featureIndex); - return std::make_unique>(labelIndices, featureIndex, numCovered_, - std::move(callbackPtr)); + std::make_unique(*this, featureSpace_.featureInfo_, featureIndex); + return std::make_unique>( + labelIndices, featureIndex, numCovered_, std::move(callbackPtr)); } public: /** - * @param thresholds A reference to an object of type `ExactThresholds` that stores the - * thresholds + * @param featureSpace A reference to an object of type `TabularFeatureSpace`, the subspace has + * been created from * @param weightedStatisticsPtr An unique pointer to an object of type `IWeightedStatistics` that * provides access to the statistics * @param weights A reference to an object of template type `WeightVector` that provides * access to the weights of individual training examples */ - ThresholdsSubset(ExactThresholds& thresholds, - std::unique_ptr weightedStatisticsPtr, - const WeightVector& weights) - : thresholds_(thresholds), weightedStatisticsPtr_(std::move(weightedStatisticsPtr)), + FeatureSubspace(TabularFeatureSpace& featureSpace, + std::unique_ptr weightedStatisticsPtr, const WeightVector& weights) + : featureSpace_(featureSpace), weightedStatisticsPtr_(std::move(weightedStatisticsPtr)), weights_(weights), numCovered_(weights.getNumNonZeroWeights()), - coverageMask_(thresholds.featureMatrix_.getNumExamples()), numModifications_(0) {} + coverageMask_(featureSpace.featureMatrix_.getNumExamples()), numModifications_(0) {} /** - * @param thresholdsSubset A reference to an object of type `ThresholdsSubset` to be copied + * @param other A reference to an object of type `FeatureSubspace` to be copied */ - ThresholdsSubset(const ThresholdsSubset& thresholdsSubset) - : thresholds_(thresholdsSubset.thresholds_), - 
weightedStatisticsPtr_(thresholdsSubset.weightedStatisticsPtr_->copy()), - weights_(thresholdsSubset.weights_), numCovered_(thresholdsSubset.numCovered_), - coverageMask_(thresholdsSubset.coverageMask_), - numModifications_(thresholdsSubset.numModifications_) {} - - std::unique_ptr copy() const override { - return std::make_unique>(*this); + FeatureSubspace(const FeatureSubspace& other) + : featureSpace_(other.featureSpace_), weightedStatisticsPtr_(other.weightedStatisticsPtr_->copy()), + weights_(other.weights_), numCovered_(other.numCovered_), coverageMask_(other.coverageMask_), + numModifications_(other.numModifications_) {} + + std::unique_ptr copy() const override { + return std::make_unique>(*this); } std::unique_ptr createRuleRefinement(const CompleteIndexVector& labelIndices, uint32 featureIndex) override { - return createExactRuleRefinement(labelIndices, featureIndex); + return createRuleRefinementInternally(labelIndices, featureIndex); } std::unique_ptr createRuleRefinement(const PartialIndexVector& labelIndices, uint32 featureIndex) override { - return createExactRuleRefinement(labelIndices, featureIndex); + return createRuleRefinementInternally(labelIndices, featureIndex); } - void filterThresholds(const Condition& condition) override { + void filterSubspace(const Condition& condition) override { uint32 featureIndex = condition.featureIndex; auto cacheFilteredIterator = cacheFiltered_.emplace(featureIndex, FilteredCacheEntry()).first; FilteredCacheEntry& cacheEntry = cacheFilteredIterator->second; @@ -188,7 +221,7 @@ class ExactThresholds final : public AbstractThresholds { if (!featureVector) { auto cacheIterator = - thresholds_.cache_.emplace(featureIndex, std::unique_ptr()).first; + featureSpace_.cache_.emplace(featureIndex, std::unique_ptr()).first; featureVector = cacheIterator->second.get(); } @@ -208,70 +241,47 @@ class ExactThresholds final : public AbstractThresholds { cacheEntry.numConditions = numModifications_; } - void resetThresholds() 
override { + void resetSubspace() override { numModifications_ = 0; numCovered_ = weights_.getNumNonZeroWeights(); cacheFiltered_.clear(); coverageMask_.reset(); } - const ICoverageState& getCoverageState() const override { + const CoverageMask& getCoverageMask() const override { return coverageMask_; } - Quality evaluateOutOfSample(const SinglePartition& partition, const CoverageMask& coverageState, + Quality evaluateOutOfSample(const SinglePartition& partition, const CoverageMask& coverageMask, const IPrediction& head) const override { return evaluateOutOfSampleInternally( - partition.cbegin(), partition.getNumElements(), weights_, coverageState, - thresholds_.statisticsProvider_.get(), head); + partition.cbegin(), partition.getNumElements(), weights_, coverageMask, + featureSpace_.statisticsProvider_.get(), head); } - Quality evaluateOutOfSample(const BiPartition& partition, const CoverageMask& coverageState, + Quality evaluateOutOfSample(const BiPartition& partition, const CoverageMask& coverageMask, const IPrediction& head) const override { return evaluateOutOfSampleInternally( - partition.first_cbegin(), partition.getNumFirst(), weights_, coverageState, - thresholds_.statisticsProvider_.get(), head); - } - - Quality evaluateOutOfSample(const SinglePartition& partition, const CoverageSet& coverageState, - const IPrediction& head) const override { - return evaluateOutOfSampleInternally(weights_, coverageState, thresholds_.statisticsProvider_.get(), - head); - } - - Quality evaluateOutOfSample(BiPartition& partition, const CoverageSet& coverageState, - const IPrediction& head) const override { - return evaluateOutOfSampleInternally(weights_, coverageState, partition, - thresholds_.statisticsProvider_.get(), head); + partition.first_cbegin(), partition.getNumFirst(), weights_, coverageMask, + featureSpace_.statisticsProvider_.get(), head); } - void recalculatePrediction(const SinglePartition& partition, const CoverageMask& coverageState, + void 
recalculatePrediction(const SinglePartition& partition, const CoverageMask& coverageMask, IPrediction& head) const override { recalculatePredictionInternally( - partition.cbegin(), partition.getNumElements(), coverageState, - thresholds_.statisticsProvider_.get(), head); + partition.cbegin(), partition.getNumElements(), coverageMask, + featureSpace_.statisticsProvider_.get(), head); } - void recalculatePrediction(const BiPartition& partition, const CoverageMask& coverageState, + void recalculatePrediction(const BiPartition& partition, const CoverageMask& coverageMask, IPrediction& head) const override { recalculatePredictionInternally( - partition.first_cbegin(), partition.getNumFirst(), coverageState, - thresholds_.statisticsProvider_.get(), head); - } - - void recalculatePrediction(const SinglePartition& partition, const CoverageSet& coverageState, - IPrediction& head) const override { - recalculatePredictionInternally(coverageState, thresholds_.statisticsProvider_.get(), head); - } - - void recalculatePrediction(BiPartition& partition, const CoverageSet& coverageState, - IPrediction& head) const override { - recalculatePredictionInternally(coverageState, partition, thresholds_.statisticsProvider_.get(), - head); + partition.first_cbegin(), partition.getNumFirst(), coverageMask, + featureSpace_.statisticsProvider_.get(), head); } void applyPrediction(const IPrediction& prediction) override { - IStatistics& statistics = thresholds_.statisticsProvider_.get(); + IStatistics& statistics = featureSpace_.statisticsProvider_.get(); uint32 numStatistics = statistics.getNumStatistics(); const CoverageMask* coverageMaskPtr = &coverageMask_; const IPrediction* predictionPtr = &prediction; @@ -279,17 +289,17 @@ class ExactThresholds final : public AbstractThresholds { #if MULTI_THREADING_SUPPORT_ENABLED #pragma omp parallel for firstprivate(numStatistics) firstprivate(coverageMaskPtr) firstprivate(predictionPtr) \ - firstprivate(statisticsPtr) schedule(dynamic) 
num_threads(thresholds_.numThreads_) + firstprivate(statisticsPtr) schedule(dynamic) num_threads(featureSpace_.numThreads_) #endif for (int64 i = 0; i < numStatistics; i++) { - if (coverageMaskPtr->isCovered(i)) { + if ((*coverageMaskPtr)[i]) { predictionPtr->apply(*statisticsPtr, i); } } } void revertPrediction(const IPrediction& prediction) override { - IStatistics& statistics = thresholds_.statisticsProvider_.get(); + IStatistics& statistics = featureSpace_.statisticsProvider_.get(); uint32 numStatistics = statistics.getNumStatistics(); const CoverageMask* coverageMaskPtr = &coverageMask_; const IPrediction* predictionPtr = &prediction; @@ -297,16 +307,24 @@ class ExactThresholds final : public AbstractThresholds { #if MULTI_THREADING_SUPPORT_ENABLED #pragma omp parallel for firstprivate(numStatistics) firstprivate(coverageMaskPtr) firstprivate(predictionPtr) \ - firstprivate(statisticsPtr) schedule(dynamic) num_threads(thresholds_.numThreads_) + firstprivate(statisticsPtr) schedule(dynamic) num_threads(featureSpace_.numThreads_) #endif for (int64 i = 0; i < numStatistics; i++) { - if (coverageMaskPtr->isCovered(i)) { + if ((*coverageMaskPtr)[i]) { predictionPtr->revert(*statisticsPtr, i); } } } }; + const IColumnWiseFeatureMatrix& featureMatrix_; + + const IFeatureInfo& featureInfo_; + + IStatisticsProvider& statisticsProvider_; + + const IFeatureBinningFactory& featureBinningFactory_; + const uint32 numThreads_; std::unordered_map> cache_; @@ -320,38 +338,51 @@ class ExactThresholds final : public AbstractThresholds { * the types of individual features * @param statisticsProvider A reference to an object of type `IStatisticsProvider` that provides access to * statistics about the labels of the training examples + * @param featureBinningFactory A reference to an object of type `IFeatureBinningFactory` that allows to create + * implementations of the binning method to be used for assigning numerical feature + * values to bins + * assign nominal feature values to 
bins * @param numThreads The number of CPU threads to be used to update statistics in parallel */ - ExactThresholds(const IColumnWiseFeatureMatrix& featureMatrix, const IFeatureInfo& featureInfo, - IStatisticsProvider& statisticsProvider, uint32 numThreads) - : AbstractThresholds(featureMatrix, featureInfo, statisticsProvider), numThreads_(numThreads) {} + TabularFeatureSpace(const IColumnWiseFeatureMatrix& featureMatrix, const IFeatureInfo& featureInfo, + IStatisticsProvider& statisticsProvider, + const IFeatureBinningFactory& featureBinningFactory, uint32 numThreads) + : featureMatrix_(featureMatrix), featureInfo_(featureInfo), statisticsProvider_(statisticsProvider), + featureBinningFactory_(featureBinningFactory), numThreads_(numThreads) {} + + IStatisticsProvider& getStatisticsProvider() const override final { + return statisticsProvider_; + } - std::unique_ptr createSubset(const EqualWeightVector& weights) override { + std::unique_ptr createSubspace(const EqualWeightVector& weights) override { IStatistics& statistics = statisticsProvider_.get(); std::unique_ptr weightedStatisticsPtr = statistics.createWeightedStatistics(weights); - return std::make_unique>( + return std::make_unique>( *this, std::move(weightedStatisticsPtr), weights); } - std::unique_ptr createSubset(const BitWeightVector& weights) override { + std::unique_ptr createSubspace(const BitWeightVector& weights) override { IStatistics& statistics = statisticsProvider_.get(); std::unique_ptr weightedStatisticsPtr = statistics.createWeightedStatistics(weights); - return std::make_unique>( + return std::make_unique>( *this, std::move(weightedStatisticsPtr), weights); } - std::unique_ptr createSubset(const DenseWeightVector& weights) override { + std::unique_ptr createSubspace(const DenseWeightVector& weights) override { IStatistics& statistics = statisticsProvider_.get(); std::unique_ptr weightedStatisticsPtr = statistics.createWeightedStatistics(weights); - return std::make_unique>>( + return 
std::make_unique>>( *this, std::move(weightedStatisticsPtr), weights); } }; -ExactThresholdsFactory::ExactThresholdsFactory(uint32 numThreads) : numThreads_(numThreads) {} +TabularFeatureSpaceFactory::TabularFeatureSpaceFactory(std::unique_ptr featureBinningFactoryPtr, + uint32 numThreads) + : featureBinningFactoryPtr_(std::move(featureBinningFactoryPtr)), numThreads_(numThreads) {} -std::unique_ptr ExactThresholdsFactory::create(const IColumnWiseFeatureMatrix& featureMatrix, - const IFeatureInfo& featureInfo, - IStatisticsProvider& statisticsProvider) const { - return std::make_unique(featureMatrix, featureInfo, statisticsProvider, numThreads_); +std::unique_ptr TabularFeatureSpaceFactory::create(const IColumnWiseFeatureMatrix& featureMatrix, + const IFeatureInfo& featureInfo, + IStatisticsProvider& statisticsProvider) const { + return std::make_unique(featureMatrix, featureInfo, statisticsProvider, + *featureBinningFactoryPtr_, numThreads_); } diff --git a/cpp/subprojects/common/src/mlrl/common/rule_refinement/prediction_complete.cpp b/cpp/subprojects/common/src/mlrl/common/rule_refinement/prediction_complete.cpp index 1b3687461a..ba8efb993f 100644 --- a/cpp/subprojects/common/src/mlrl/common/rule_refinement/prediction_complete.cpp +++ b/cpp/subprojects/common/src/mlrl/common/rule_refinement/prediction_complete.cpp @@ -88,9 +88,9 @@ std::unique_ptr CompletePrediction::createStatisticsSubset( return statistics.createSubset(indexVector_, weights); } -std::unique_ptr CompletePrediction::createRuleRefinement(IThresholdsSubset& thresholdsSubset, +std::unique_ptr CompletePrediction::createRuleRefinement(IFeatureSubspace& featureSubspace, uint32 featureIndex) const { - return indexVector_.createRuleRefinement(thresholdsSubset, featureIndex); + return indexVector_.createRuleRefinement(featureSubspace, featureIndex); } void CompletePrediction::apply(IStatistics& statistics, uint32 statisticIndex) const { diff --git 
a/cpp/subprojects/common/src/mlrl/common/rule_refinement/prediction_partial.cpp b/cpp/subprojects/common/src/mlrl/common/rule_refinement/prediction_partial.cpp index 69ed32b8d9..91002968a6 100644 --- a/cpp/subprojects/common/src/mlrl/common/rule_refinement/prediction_partial.cpp +++ b/cpp/subprojects/common/src/mlrl/common/rule_refinement/prediction_partial.cpp @@ -134,9 +134,9 @@ std::unique_ptr PartialPrediction::createStatisticsSubset( return statistics.createSubset(indexVector_, weights); } -std::unique_ptr PartialPrediction::createRuleRefinement(IThresholdsSubset& thresholdsSubset, +std::unique_ptr PartialPrediction::createRuleRefinement(IFeatureSubspace& featureSubspace, uint32 featureIndex) const { - return indexVector_.createRuleRefinement(thresholdsSubset, featureIndex); + return indexVector_.createRuleRefinement(featureSubspace, featureIndex); } void PartialPrediction::apply(IStatistics& statistics, uint32 statisticIndex) const { diff --git a/cpp/subprojects/common/src/mlrl/common/rule_refinement/rule_refinement_approximate.cpp b/cpp/subprojects/common/src/mlrl/common/rule_refinement/rule_refinement_approximate.cpp deleted file mode 100644 index ccd39f14e7..0000000000 --- a/cpp/subprojects/common/src/mlrl/common/rule_refinement/rule_refinement_approximate.cpp +++ /dev/null @@ -1,408 +0,0 @@ -#include "mlrl/common/rule_refinement/rule_refinement_approximate.hpp" - -template -static inline void findRefinementInternally(const IndexVector& labelIndices, uint32 numExamples, uint32 featureIndex, - bool ordinal, bool nominal, uint32 minCoverage, - IRuleRefinementCallback& callback, - RefinementComparator& comparator) { - Refinement refinement; - refinement.featureIndex = featureIndex; - - // Invoke the callback... 
- IRuleRefinementCallback::Result callbackResult = callback.get(); - const IHistogram& statistics = callbackResult.statistics; - const ThresholdVector& thresholdVector = callbackResult.vector; - ThresholdVector::const_iterator thresholdIterator = thresholdVector.cbegin(); - uint32 numBins = thresholdVector.getNumElements(); - uint32 sparseBinIndex = thresholdVector.getSparseBinIndex(); - bool sparse = sparseBinIndex < numBins; - - // Create a new, empty subset of the statistics... - std::unique_ptr statisticsSubsetPtr = statistics.createSubset(labelIndices); - - for (auto it = thresholdVector.missing_indices_cbegin(); it != thresholdVector.missing_indices_cend(); it++) { - uint32 i = *it; - statisticsSubsetPtr->addToMissing(i); - } - - // In the following, we start by processing the bins in range [0, sparseBinIndex)... - uint32 numCovered = 0; - int64 firstR = 0; - int64 r; - - // Traverse bins in ascending order until the first bin with non-zero weight is encountered... - for (r = 0; r < sparseBinIndex; r++) { - uint32 weight = statistics.getBinWeight(r); - - if (weight > 0) { - // Add the bin to the subset to mark it as covered by upcoming refinements... - statisticsSubsetPtr->addToSubset(r); - numCovered += weight; - break; - } - } - - uint32 numAccumulated = numCovered; - - // Traverse the remaining bins in ascending order... - if (numCovered > 0) { - for (r = r + 1; r < sparseBinIndex; r++) { - uint32 weight = statistics.getBinWeight(r); - - // Do only consider bins that are not empty... - if (weight > 0) { - // Check if a condition that uses the <= operator (or the == operator in case of a nominal feature) - // covers at least `minCoverage` examples... - if (numCovered >= minCoverage) { - // Determine the best prediction for the covered examples... - const IScoreVector& scoreVector = statisticsSubsetPtr->calculateScores(); - - // Check if the quality of the prediction is better than the quality of the current rule... 
- if (comparator.isImprovement(scoreVector)) { - refinement.start = firstR; - refinement.end = r; - refinement.inverse = false; - refinement.numCovered = numCovered; - - if (nominal) { - refinement.threshold = (int32) thresholdIterator[r - 1]; - refinement.comparator = NOMINAL_EQ; - } else { - refinement.threshold = - ordinal ? (int32) thresholdIterator[r - 1] : thresholdIterator[r - 1]; - refinement.comparator = ordinal ? ORDINAL_LEQ : NUMERICAL_LEQ; - } - - comparator.pushRefinement(refinement, scoreVector); - } - } - - // Check if a condition that uses the > operator (or the != operator in case of a nominal feature) - // covers at least `minCoverage` examples... - uint32 coverage = numExamples - numCovered; - - if (coverage >= minCoverage) { - // Determine the best prediction for the covered examples... - const IScoreVector& scoreVector = statisticsSubsetPtr->calculateScoresUncovered(); - - // Check if the quality of the prediction is better than the quality of the current rule... - if (comparator.isImprovement(scoreVector)) { - refinement.start = firstR; - refinement.end = r; - refinement.inverse = true; - refinement.numCovered = coverage; - - if (nominal) { - refinement.threshold = (int32) thresholdIterator[r - 1]; - refinement.comparator = NOMINAL_NEQ; - } else { - refinement.threshold = - ordinal ? (int32) thresholdIterator[r - 1] : thresholdIterator[r - 1]; - refinement.comparator = ordinal ? ORDINAL_GR : NUMERICAL_GR; - } - - comparator.pushRefinement(refinement, scoreVector); - } - } - - // Reset the subset in case of a nominal feature, as the previous bins will not be covered by the next - // condition... - if (nominal) { - statisticsSubsetPtr->resetSubset(); - numCovered = 0; - firstR = r; - } - - // Add the bin to the subset to mark it as covered by upcoming refinements... 
- statisticsSubsetPtr->addToSubset(r); - numCovered += weight; - numAccumulated += weight; - } - } - - // If any bins have been processed so far and if there is a sparse bin, we must evaluate additional conditions - // that separate the bins that have been iterated from the remaining ones (including the sparse bin)... - if (numCovered > 0 && sparse) { - // Check if a condition that uses the <= operator (or the == operator in case of a nominal feature) covers - // at least `minCoverage` examples... - if (numCovered >= minCoverage) { - // Determine the best prediction for the covered examples... - const IScoreVector& scoreVector = statisticsSubsetPtr->calculateScores(); - - // Check if the quality of the prediction is better than the quality of the current rule... - if (comparator.isImprovement(scoreVector)) { - refinement.start = firstR; - refinement.end = sparseBinIndex; - refinement.inverse = false; - refinement.numCovered = numCovered; - - if (nominal) { - refinement.threshold = (int32) thresholdIterator[sparseBinIndex - 1]; - refinement.comparator = NOMINAL_EQ; - } else { - refinement.threshold = ordinal ? (int32) thresholdIterator[sparseBinIndex - 1] - : thresholdIterator[sparseBinIndex - 1]; - refinement.comparator = ordinal ? ORDINAL_LEQ : NUMERICAL_LEQ; - } - - comparator.pushRefinement(refinement, scoreVector); - } - } - - // Check if a condition that uses the > operator (or the != operator in case of a nominal feature) covers at - // least `minCoverage` examples... - uint32 coverage = numExamples - numCovered; - - if (coverage >= minCoverage) { - // Determine the best prediction for the covered examples... - const IScoreVector& scoreVector = statisticsSubsetPtr->calculateScoresUncovered(); - - // Check if the quality of the prediction is better than the quality of the current rule... 
- if (comparator.isImprovement(scoreVector)) { - refinement.start = firstR; - refinement.end = sparseBinIndex; - refinement.inverse = true; - refinement.numCovered = coverage; - - if (nominal) { - refinement.threshold = (int32) thresholdIterator[sparseBinIndex - 1]; - refinement.comparator = NOMINAL_NEQ; - } else { - refinement.threshold = ordinal ? (int32) thresholdIterator[sparseBinIndex - 1] - : thresholdIterator[sparseBinIndex - 1]; - refinement.comparator = ordinal ? ORDINAL_GR : NUMERICAL_GR; - } - - comparator.pushRefinement(refinement, scoreVector); - } - } - } - - // Reset the subset, if any bins have been processed... - statisticsSubsetPtr->resetSubset(); - } - - uint32 numAccumulatedPrevious = numAccumulated; - - // We continue by processing the bins in range (sparseBinIndex, numBins)... - numCovered = 0; - firstR = ((int64) numBins) - 1; - - // Traverse bins in descending order until the first bin with non-zero weight is encountered... - for (r = firstR; r > sparseBinIndex; r--) { - uint32 weight = statistics.getBinWeight(r); - - if (weight > 0) { - // Add the bin to the subset to mark it as covered by upcoming refinements... - statisticsSubsetPtr->addToSubset(r); - numCovered += weight; - break; - } - } - - numAccumulated = numCovered; - - // Traverse the remaining bins in descending order... - if (numCovered > 0) { - for (r = r - 1; r > sparseBinIndex; r--) { - uint32 weight = statistics.getBinWeight(r); - - // Do only consider bins that are not empty... - if (weight > 0) { - // Check if a condition that uses the > operator (or the == operator in case of a nominal feature) - // covers at least `minCoverage` examples... - if (numCovered >= minCoverage) { - // Determine the best prediction for the covered examples... - const IScoreVector& scoreVector = statisticsSubsetPtr->calculateScores(); - - // Check if the quality of the prediction is better than the quality of the current rule... 
- if (comparator.isImprovement(scoreVector)) { - refinement.start = firstR; - refinement.end = r; - refinement.inverse = false; - refinement.numCovered = numCovered; - - if (nominal) { - refinement.threshold = (int32) thresholdIterator[firstR]; - refinement.comparator = NOMINAL_EQ; - } else { - refinement.threshold = ordinal ? (int32) thresholdIterator[r] : thresholdIterator[r]; - refinement.comparator = ordinal ? ORDINAL_GR : NUMERICAL_GR; - } - - comparator.pushRefinement(refinement, scoreVector); - } - } - - // Check if a condition that uses the <= operator (or the != operator in case of a nominal feature) - // covers at least `minCoverage` examples... - uint32 coverage = numExamples - numCovered; - - if (coverage >= minCoverage) { - // Determine the best prediction for the covered examples... - const IScoreVector& scoreVector = statisticsSubsetPtr->calculateScoresUncovered(); - - // Check if the quality of the prediction is better than the quality of the current rule... - if (comparator.isImprovement(scoreVector)) { - refinement.start = firstR; - refinement.end = r; - refinement.inverse = true; - refinement.numCovered = coverage; - - if (nominal) { - refinement.threshold = (int32) thresholdIterator[firstR]; - refinement.comparator = NOMINAL_NEQ; - } else { - refinement.threshold = ordinal ? (int32) thresholdIterator[r] : thresholdIterator[r]; - refinement.comparator = ordinal ? ORDINAL_LEQ : NUMERICAL_LEQ; - } - - comparator.pushRefinement(refinement, scoreVector); - } - } - - // Reset the subset in case of a nominal feature, as the previous bins will not be covered by the next - // condition... - if (nominal) { - statisticsSubsetPtr->resetSubset(); - numCovered = 0; - firstR = r; - } - - // Add the bin to the subset to mark it as covered by upcoming refinements... 
- statisticsSubsetPtr->addToSubset(r); - numCovered += weight; - numAccumulated += weight; - } - } - - // If there is a sparse bin, we must evaluate additional conditions that separate the bins in range - // (sparseBinIndex, numBins) from the remaining ones... - if (sparse) { - // Check if a condition that uses the > operator (or the == operator in case of a nominal feature) covers at - // least `minCoverage` examples... - if (numCovered >= minCoverage) { - // Determine the best prediction for the covered examples... - const IScoreVector& scoreVector = statisticsSubsetPtr->calculateScores(); - - // Check if the quality of the prediction is better than the quality of the current rule... - if (comparator.isImprovement(scoreVector)) { - refinement.start = firstR; - refinement.end = sparseBinIndex; - refinement.inverse = false; - refinement.numCovered = numCovered; - - if (nominal) { - refinement.threshold = (int32) thresholdIterator[firstR]; - refinement.comparator = NOMINAL_EQ; - } else { - refinement.threshold = - ordinal ? (int32) thresholdIterator[sparseBinIndex] : thresholdIterator[sparseBinIndex]; - refinement.comparator = ordinal ? ORDINAL_GR : NUMERICAL_GR; - } - - comparator.pushRefinement(refinement, scoreVector); - } - } - - // Check if a condition that uses the <= operator (or the != operator in case of a nominal feature) covers - // at least `minCoverage` examples... - uint32 coverage = numExamples - numCovered; - - if (coverage >= minCoverage) { - // Determine the best prediction for the covered examples... - const IScoreVector& scoreVector = statisticsSubsetPtr->calculateScoresUncovered(); - - // Check if the quality of the prediction is better than the quality of the current rule... 
- if (comparator.isImprovement(scoreVector)) { - refinement.start = firstR; - refinement.end = sparseBinIndex; - refinement.inverse = true; - refinement.numCovered = coverage; - - if (nominal) { - refinement.threshold = (int32) thresholdIterator[firstR]; - refinement.comparator = NOMINAL_NEQ; - } else { - refinement.threshold = - ordinal ? (int32) thresholdIterator[sparseBinIndex] : thresholdIterator[sparseBinIndex]; - refinement.comparator = ordinal ? ORDINAL_LEQ : NUMERICAL_LEQ; - } - - comparator.pushRefinement(refinement, scoreVector); - } - } - - // If the feature is nominal and if any bins in the range [0, sparseBinIndex) have been processed earlier, - // we must test additional conditions that separate the sparse bin from the remaining bins... - if (nominal && numAccumulatedPrevious > 0) { - // Reset the subset once again to ensure that the accumulated state includes all bins that have been - // processed so far... - statisticsSubsetPtr->resetSubset(); - - // Check if the condition `f != thresholdIterator[sparseBinIndex]` covers at least `minCoverage` - // examples... - uint32 coverage = numExamples - numAccumulated - numAccumulatedPrevious; - - if (coverage >= minCoverage) { - // Determine the best prediction for the covered examples... - const IScoreVector& scoreVector = statisticsSubsetPtr->calculateScoresAccumulated(); - - // Check if the quality of the prediction is better than the quality of the current rule... - if (comparator.isImprovement(scoreVector)) { - refinement.start = sparseBinIndex; - refinement.end = sparseBinIndex + 1; - refinement.inverse = true; - refinement.numCovered = coverage; - refinement.threshold = (int32) thresholdIterator[sparseBinIndex]; - refinement.comparator = NOMINAL_NEQ; - comparator.pushRefinement(refinement, scoreVector); - } - } - - // Check if the condition `f == thresholdIterator[sparseBinIndex]` covers at least `minCoverage` - // examples... 
- coverage = numAccumulated + numAccumulatedPrevious; - - if (coverage >= minCoverage) { - // Determine the best prediction for the covered examples... - const IScoreVector& scoreVector = statisticsSubsetPtr->calculateScoresUncoveredAccumulated(); - - // Check if the quality of the prediction is better than the quality of the current rule... - if (comparator.isImprovement(scoreVector)) { - refinement.start = sparseBinIndex; - refinement.end = sparseBinIndex + 1; - refinement.inverse = false; - refinement.numCovered = coverage; - refinement.threshold = (int32) thresholdIterator[sparseBinIndex]; - refinement.comparator = NOMINAL_EQ; - comparator.pushRefinement(refinement, scoreVector); - } - } - } - } - } -} - -template -ApproximateRuleRefinement::ApproximateRuleRefinement(const IndexVector& labelIndices, uint32 numExamples, - uint32 featureIndex, bool ordinal, bool nominal, - std::unique_ptr callbackPtr) - : labelIndices_(labelIndices), numExamples_(numExamples), featureIndex_(featureIndex), ordinal_(ordinal), - nominal_(nominal), callbackPtr_(std::move(callbackPtr)) {} - -template -void ApproximateRuleRefinement::findRefinement(SingleRefinementComparator& comparator, - uint32 minCoverage) { - findRefinementInternally(labelIndices_, numExamples_, featureIndex_, ordinal_, nominal_, minCoverage, *callbackPtr_, - comparator); -} - -template -void ApproximateRuleRefinement::findRefinement(FixedRefinementComparator& comparator, uint32 minCoverage) { - findRefinementInternally(labelIndices_, numExamples_, featureIndex_, ordinal_, nominal_, minCoverage, *callbackPtr_, - comparator); -} - -template class ApproximateRuleRefinement; -template class ApproximateRuleRefinement; diff --git a/cpp/subprojects/common/src/mlrl/common/rule_refinement/rule_refinement_exact.cpp b/cpp/subprojects/common/src/mlrl/common/rule_refinement/rule_refinement_exact.cpp deleted file mode 100644 index ae81e9a5a3..0000000000 --- 
a/cpp/subprojects/common/src/mlrl/common/rule_refinement/rule_refinement_exact.cpp +++ /dev/null @@ -1,45 +0,0 @@ -#include "mlrl/common/rule_refinement/rule_refinement_exact.hpp" - -#include "mlrl/common/rule_refinement/rule_refinement_search.hpp" - -template -static inline void findRefinementInternally( - const IndexVector& labelIndices, uint32 featureIndex, uint32 numExamplesWithNonZeroWeights, - IRuleRefinementCallback& callback, Comparator& comparator, - uint32 minCoverage) { - // Invoke the callback... - IRuleRefinementCallback::Result callbackResult = callback.get(); - const IImmutableWeightedStatistics& statistics = callbackResult.statistics; - const IFeatureVector& featureVector = callbackResult.vector; - - // Create a new, empty subset of the statistics... - std::unique_ptr statisticsSubsetPtr = statistics.createSubset(labelIndices); - - RuleRefinementSearch ruleRefinementSearch; - Refinement refinement; - refinement.featureIndex = featureIndex; - featureVector.searchForRefinement(ruleRefinementSearch, *statisticsSubsetPtr, comparator, - numExamplesWithNonZeroWeights, minCoverage, refinement); -} - -template -ExactRuleRefinement::ExactRuleRefinement(const IndexVector& labelIndices, uint32 featureIndex, - uint32 numExamplesWithNonZeroWeights, - std::unique_ptr callbackPtr) - : labelIndices_(labelIndices), featureIndex_(featureIndex), - numExamplesWithNonZeroWeights_(numExamplesWithNonZeroWeights), callbackPtr_(std::move(callbackPtr)) {} - -template -void ExactRuleRefinement::findRefinement(SingleRefinementComparator& comparator, uint32 minCoverage) { - findRefinementInternally(labelIndices_, featureIndex_, numExamplesWithNonZeroWeights_, *callbackPtr_, comparator, - minCoverage); -} - -template -void ExactRuleRefinement::findRefinement(FixedRefinementComparator& comparator, uint32 minCoverage) { - findRefinementInternally(labelIndices_, featureIndex_, numExamplesWithNonZeroWeights_, *callbackPtr_, comparator, - minCoverage); -} - -template class 
ExactRuleRefinement; -template class ExactRuleRefinement; diff --git a/cpp/subprojects/common/src/mlrl/common/rule_refinement/rule_refinement_feature_based.cpp b/cpp/subprojects/common/src/mlrl/common/rule_refinement/rule_refinement_feature_based.cpp new file mode 100644 index 0000000000..e327e83313 --- /dev/null +++ b/cpp/subprojects/common/src/mlrl/common/rule_refinement/rule_refinement_feature_based.cpp @@ -0,0 +1,46 @@ +#include "mlrl/common/rule_refinement/rule_refinement_feature_based.hpp" + +#include "mlrl/common/rule_refinement/feature_based_search.hpp" + +template +static inline void findRefinementInternally(const IndexVector& labelIndices, uint32 featureIndex, + uint32 numExamplesWithNonZeroWeights, IRuleRefinement::ICallback& callback, + Comparator& comparator, uint32 minCoverage) { + // Invoke the callback... + IRuleRefinement::ICallback::Result callbackResult = callback.get(); + const IImmutableWeightedStatistics& statistics = callbackResult.statistics; + const IFeatureVector& featureVector = callbackResult.featureVector; + + // Create a new, empty subset of the statistics... 
+ std::unique_ptr statisticsSubsetPtr = statistics.createSubset(labelIndices); + + FeatureBasedSearch featureBasedSearch; + Refinement refinement; + refinement.featureIndex = featureIndex; + featureVector.searchForRefinement(featureBasedSearch, *statisticsSubsetPtr, comparator, + numExamplesWithNonZeroWeights, minCoverage, refinement); +} + +template +FeatureBasedRuleRefinement::FeatureBasedRuleRefinement( + const IndexVector& labelIndices, uint32 featureIndex, uint32 numExamplesWithNonZeroWeights, + std::unique_ptr callbackPtr) + : labelIndices_(labelIndices), featureIndex_(featureIndex), + numExamplesWithNonZeroWeights_(numExamplesWithNonZeroWeights), callbackPtr_(std::move(callbackPtr)) {} + +template +void FeatureBasedRuleRefinement::findRefinement(SingleRefinementComparator& comparator, + uint32 minCoverage) const { + findRefinementInternally(labelIndices_, featureIndex_, numExamplesWithNonZeroWeights_, *callbackPtr_, comparator, + minCoverage); +} + +template +void FeatureBasedRuleRefinement::findRefinement(FixedRefinementComparator& comparator, + uint32 minCoverage) const { + findRefinementInternally(labelIndices_, featureIndex_, numExamplesWithNonZeroWeights_, *callbackPtr_, comparator, + minCoverage); +} + +template class FeatureBasedRuleRefinement; +template class FeatureBasedRuleRefinement; diff --git a/cpp/subprojects/common/src/mlrl/common/rule_refinement/rule_refinement_search.cpp b/cpp/subprojects/common/src/mlrl/common/rule_refinement/rule_refinement_search.cpp deleted file mode 100644 index 5addb24d7a..0000000000 --- a/cpp/subprojects/common/src/mlrl/common/rule_refinement/rule_refinement_search.cpp +++ /dev/null @@ -1,102 +0,0 @@ -#include "mlrl/common/rule_refinement/rule_refinement_search.hpp" - -#include "rule_refinement_search_binary.hpp" -#include "rule_refinement_search_nominal.hpp" -#include "rule_refinement_search_numerical.hpp" -#include "rule_refinement_search_ordinal.hpp" - -static inline void 
addMissingStatistics(IWeightedStatisticsSubset& statisticsSubset, - const MissingFeatureVector& missingFeatureVector) { - for (auto it = missingFeatureVector.indices_cbegin(); it != missingFeatureVector.indices_cend(); it++) { - uint32 index = *it; - statisticsSubset.addToMissing(index); - } -} - -void RuleRefinementSearch::searchForNumericalRefinement(const NumericalFeatureVector& featureVector, - const MissingFeatureVector& missingFeatureVector, - IWeightedStatisticsSubset& statisticsSubset, - SingleRefinementComparator& comparator, - uint32 numExamplesWithNonZeroWeights, uint32 minCoverage, - Refinement& refinement) const { - addMissingStatistics(statisticsSubset, missingFeatureVector); - searchForNumericalRefinementInternally(featureVector, statisticsSubset, comparator, numExamplesWithNonZeroWeights, - minCoverage, refinement); -} - -void RuleRefinementSearch::searchForNumericalRefinement(const NumericalFeatureVector& featureVector, - const MissingFeatureVector& missingFeatureVector, - IWeightedStatisticsSubset& statisticsSubset, - FixedRefinementComparator& comparator, - uint32 numExamplesWithNonZeroWeights, uint32 minCoverage, - Refinement& refinement) const { - addMissingStatistics(statisticsSubset, missingFeatureVector); - searchForNumericalRefinementInternally(featureVector, statisticsSubset, comparator, numExamplesWithNonZeroWeights, - minCoverage, refinement); -} - -void RuleRefinementSearch::searchForNominalRefinement(const NominalFeatureVector& featureVector, - const MissingFeatureVector& missingFeatureVector, - IWeightedStatisticsSubset& statisticsSubset, - SingleRefinementComparator& comparator, - uint32 numExamplesWithNonZeroWeights, uint32 minCoverage, - Refinement& refinement) const { - addMissingStatistics(statisticsSubset, missingFeatureVector); - searchForNominalRefinementInternally(featureVector, statisticsSubset, comparator, numExamplesWithNonZeroWeights, - minCoverage, refinement); -} - -void 
RuleRefinementSearch::searchForNominalRefinement(const NominalFeatureVector& featureVector, - const MissingFeatureVector& missingFeatureVector, - IWeightedStatisticsSubset& statisticsSubset, - FixedRefinementComparator& comparator, - uint32 numExamplesWithNonZeroWeights, uint32 minCoverage, - Refinement& refinement) const { - addMissingStatistics(statisticsSubset, missingFeatureVector); - searchForNominalRefinementInternally(featureVector, statisticsSubset, comparator, numExamplesWithNonZeroWeights, - minCoverage, refinement); -} - -void RuleRefinementSearch::searchForBinaryRefinement(const BinaryFeatureVector& featureVector, - const MissingFeatureVector& missingFeatureVector, - IWeightedStatisticsSubset& statisticsSubset, - SingleRefinementComparator& comparator, - uint32 numExamplesWithNonZeroWeights, uint32 minCoverage, - Refinement& refinement) const { - addMissingStatistics(statisticsSubset, missingFeatureVector); - searchForBinaryRefinementInternally(featureVector, statisticsSubset, comparator, numExamplesWithNonZeroWeights, - minCoverage, refinement); -} - -void RuleRefinementSearch::searchForBinaryRefinement(const BinaryFeatureVector& featureVector, - const MissingFeatureVector& missingFeatureVector, - IWeightedStatisticsSubset& statisticsSubset, - FixedRefinementComparator& comparator, - uint32 numExamplesWithNonZeroWeights, uint32 minCoverage, - Refinement& refinement) const { - addMissingStatistics(statisticsSubset, missingFeatureVector); - searchForBinaryRefinementInternally(featureVector, statisticsSubset, comparator, numExamplesWithNonZeroWeights, - minCoverage, refinement); -} - -void RuleRefinementSearch::searchForOrdinalRefinement(const OrdinalFeatureVector& featureVector, - const MissingFeatureVector& missingFeatureVector, - IWeightedStatisticsSubset& statisticsSubset, - SingleRefinementComparator& comparator, - uint32 numExamplesWithNonZeroWeights, uint32 minCoverage, - Refinement& refinement) const { - addMissingStatistics(statisticsSubset, 
missingFeatureVector); - searchForOrdinalRefinementInternally(featureVector, statisticsSubset, comparator, numExamplesWithNonZeroWeights, - minCoverage, refinement); -} - -void RuleRefinementSearch::searchForOrdinalRefinement(const OrdinalFeatureVector& featureVector, - const MissingFeatureVector& missingFeatureVector, - IWeightedStatisticsSubset& statisticsSubset, - FixedRefinementComparator& comparator, - uint32 numExamplesWithNonZeroWeights, uint32 minCoverage, - Refinement& refinement) const { - addMissingStatistics(statisticsSubset, missingFeatureVector); - searchForOrdinalRefinementInternally(featureVector, statisticsSubset, comparator, numExamplesWithNonZeroWeights, - minCoverage, refinement); -} diff --git a/cpp/subprojects/common/src/mlrl/common/sampling/partition_bi.cpp b/cpp/subprojects/common/src/mlrl/common/sampling/partition_bi.cpp index c0be442b71..706f0f2767 100644 --- a/cpp/subprojects/common/src/mlrl/common/sampling/partition_bi.cpp +++ b/cpp/subprojects/common/src/mlrl/common/sampling/partition_bi.cpp @@ -1,10 +1,10 @@ #include "mlrl/common/sampling/partition_bi.hpp" #include "mlrl/common/prediction/probability_calibration_joint.hpp" +#include "mlrl/common/rule_refinement/feature_subspace.hpp" #include "mlrl/common/rule_refinement/prediction.hpp" #include "mlrl/common/sampling/instance_sampling.hpp" #include "mlrl/common/stopping/stopping_criterion.hpp" -#include "mlrl/common/thresholds/thresholds_subset.hpp" #include @@ -76,14 +76,14 @@ std::unique_ptr BiPartition::createInstanceSampling(const IIn return labelMatrix.createInstanceSampling(factory, *this, statistics); } -Quality BiPartition::evaluateOutOfSample(const IThresholdsSubset& thresholdsSubset, const ICoverageState& coverageState, +Quality BiPartition::evaluateOutOfSample(const IFeatureSubspace& featureSubspace, const CoverageMask& coverageMask, const IPrediction& head) { - return coverageState.evaluateOutOfSample(thresholdsSubset, *this, head); + return 
featureSubspace.evaluateOutOfSample(*this, coverageMask, head); } -void BiPartition::recalculatePrediction(const IThresholdsSubset& thresholdsSubset, const ICoverageState& coverageState, +void BiPartition::recalculatePrediction(const IFeatureSubspace& featureSubspace, const CoverageMask& coverageMask, IPrediction& head) { - coverageState.recalculatePrediction(thresholdsSubset, *this, head); + featureSubspace.recalculatePrediction(*this, coverageMask, head); } std::unique_ptr BiPartition::fitMarginalProbabilityCalibrationModel( diff --git a/cpp/subprojects/common/src/mlrl/common/sampling/partition_single.cpp b/cpp/subprojects/common/src/mlrl/common/sampling/partition_single.cpp index 9c1c07035c..392e81bc61 100644 --- a/cpp/subprojects/common/src/mlrl/common/sampling/partition_single.cpp +++ b/cpp/subprojects/common/src/mlrl/common/sampling/partition_single.cpp @@ -1,10 +1,10 @@ #include "mlrl/common/sampling/partition_single.hpp" #include "mlrl/common/prediction/probability_calibration_joint.hpp" +#include "mlrl/common/rule_refinement/feature_subspace.hpp" #include "mlrl/common/rule_refinement/prediction.hpp" #include "mlrl/common/sampling/instance_sampling.hpp" #include "mlrl/common/stopping/stopping_criterion.hpp" -#include "mlrl/common/thresholds/thresholds_subset.hpp" SinglePartition::SinglePartition(uint32 numElements) : numElements_(numElements) {} @@ -30,14 +30,14 @@ std::unique_ptr SinglePartition::createInstanceSampling(const return labelMatrix.createInstanceSampling(factory, *this, statistics); } -Quality SinglePartition::evaluateOutOfSample(const IThresholdsSubset& thresholdsSubset, - const ICoverageState& coverageState, const IPrediction& head) { - return coverageState.evaluateOutOfSample(thresholdsSubset, *this, head); +Quality SinglePartition::evaluateOutOfSample(const IFeatureSubspace& featureSubspace, const CoverageMask& coverageMask, + const IPrediction& head) { + return featureSubspace.evaluateOutOfSample(*this, coverageMask, head); } -void 
SinglePartition::recalculatePrediction(const IThresholdsSubset& thresholdsSubset, - const ICoverageState& coverageState, IPrediction& head) { - coverageState.recalculatePrediction(thresholdsSubset, *this, head); +void SinglePartition::recalculatePrediction(const IFeatureSubspace& featureSubspace, const CoverageMask& coverageMask, + IPrediction& head) { + featureSubspace.recalculatePrediction(*this, coverageMask, head); } std::unique_ptr SinglePartition::fitMarginalProbabilityCalibrationModel( diff --git a/cpp/subprojects/common/src/mlrl/common/sampling/weight_vector_bit.cpp b/cpp/subprojects/common/src/mlrl/common/sampling/weight_vector_bit.cpp index 2e15008502..72f737889f 100644 --- a/cpp/subprojects/common/src/mlrl/common/sampling/weight_vector_bit.cpp +++ b/cpp/subprojects/common/src/mlrl/common/sampling/weight_vector_bit.cpp @@ -1,7 +1,7 @@ #include "mlrl/common/sampling/weight_vector_bit.hpp" -#include "mlrl/common/thresholds/thresholds.hpp" -#include "mlrl/common/thresholds/thresholds_subset.hpp" +#include "mlrl/common/rule_refinement/feature_space.hpp" +#include "mlrl/common/rule_refinement/feature_subspace.hpp" BitWeightVector::BitWeightVector(uint32 numElements, bool init) : vector_(numElements, init), numNonZeroWeights_(0) {} @@ -33,6 +33,6 @@ void BitWeightVector::clear() { vector_.clear(); } -std::unique_ptr BitWeightVector::createThresholdsSubset(IThresholds& thresholds) const { - return thresholds.createSubset(*this); +std::unique_ptr BitWeightVector::createFeatureSubspace(IFeatureSpace& featureSpace) const { + return featureSpace.createSubspace(*this); } diff --git a/cpp/subprojects/common/src/mlrl/common/sampling/weight_vector_dense.cpp b/cpp/subprojects/common/src/mlrl/common/sampling/weight_vector_dense.cpp index 6967d02b89..937a55394a 100644 --- a/cpp/subprojects/common/src/mlrl/common/sampling/weight_vector_dense.cpp +++ b/cpp/subprojects/common/src/mlrl/common/sampling/weight_vector_dense.cpp @@ -1,7 +1,7 @@ #include 
"mlrl/common/sampling/weight_vector_dense.hpp" -#include "mlrl/common/thresholds/thresholds.hpp" -#include "mlrl/common/thresholds/thresholds_subset.hpp" +#include "mlrl/common/rule_refinement/feature_space.hpp" +#include "mlrl/common/rule_refinement/feature_subspace.hpp" template DenseWeightVector::DenseWeightVector(uint32 numElements, bool init) @@ -23,8 +23,8 @@ bool DenseWeightVector::hasZeroWeights() const { } template -std::unique_ptr DenseWeightVector::createThresholdsSubset(IThresholds& thresholds) const { - return thresholds.createSubset(*this); +std::unique_ptr DenseWeightVector::createFeatureSubspace(IFeatureSpace& featureSpace) const { + return featureSpace.createSubspace(*this); } template class DenseWeightVector; diff --git a/cpp/subprojects/common/src/mlrl/common/sampling/weight_vector_equal.cpp b/cpp/subprojects/common/src/mlrl/common/sampling/weight_vector_equal.cpp index 3036cad419..cd6d13e4da 100644 --- a/cpp/subprojects/common/src/mlrl/common/sampling/weight_vector_equal.cpp +++ b/cpp/subprojects/common/src/mlrl/common/sampling/weight_vector_equal.cpp @@ -1,7 +1,7 @@ #include "mlrl/common/sampling/weight_vector_equal.hpp" -#include "mlrl/common/thresholds/thresholds.hpp" -#include "mlrl/common/thresholds/thresholds_subset.hpp" +#include "mlrl/common/rule_refinement/feature_space.hpp" +#include "mlrl/common/rule_refinement/feature_subspace.hpp" EqualWeightVector::EqualWeightVector(uint32 numElements) : numElements_(numElements) {} @@ -21,6 +21,6 @@ bool EqualWeightVector::hasZeroWeights() const { return false; } -std::unique_ptr EqualWeightVector::createThresholdsSubset(IThresholds& thresholds) const { - return thresholds.createSubset(*this); +std::unique_ptr EqualWeightVector::createFeatureSubspace(IFeatureSpace& featureSpace) const { + return featureSpace.createSubspace(*this); } diff --git a/cpp/subprojects/common/src/mlrl/common/thresholds/coverage_mask.cpp b/cpp/subprojects/common/src/mlrl/common/thresholds/coverage_mask.cpp deleted file 
mode 100644 index 388baa862e..0000000000 --- a/cpp/subprojects/common/src/mlrl/common/thresholds/coverage_mask.cpp +++ /dev/null @@ -1,54 +0,0 @@ -#include "mlrl/common/thresholds/coverage_mask.hpp" - -#include "mlrl/common/rule_refinement/prediction.hpp" -#include "mlrl/common/thresholds/thresholds_subset.hpp" - -CoverageMask::CoverageMask(uint32 numElements) - : DenseVectorDecorator>(AllocatedVector(numElements, true)), indicatorValue_(0) {} - -CoverageMask::CoverageMask(const CoverageMask& other) - : DenseVectorDecorator>(AllocatedVector(other.getNumElements())), - indicatorValue_(other.indicatorValue_) { - copyView(other.cbegin(), this->begin(), this->getNumElements()); -} - -uint32 CoverageMask::getIndicatorValue() const { - return indicatorValue_; -} - -void CoverageMask::setIndicatorValue(uint32 indicatorValue) { - indicatorValue_ = indicatorValue; -} - -void CoverageMask::reset() { - indicatorValue_ = 0; - setViewToZeros(this->begin(), this->getNumElements()); -} - -bool CoverageMask::isCovered(uint32 pos) const { - return this->view.array[pos] == indicatorValue_; -} - -std::unique_ptr CoverageMask::copy() const { - return std::make_unique(*this); -} - -Quality CoverageMask::evaluateOutOfSample(const IThresholdsSubset& thresholdsSubset, const SinglePartition& partition, - const IPrediction& head) const { - return thresholdsSubset.evaluateOutOfSample(partition, *this, head); -} - -Quality CoverageMask::evaluateOutOfSample(const IThresholdsSubset& thresholdsSubset, BiPartition& partition, - const IPrediction& head) const { - return thresholdsSubset.evaluateOutOfSample(partition, *this, head); -} - -void CoverageMask::recalculatePrediction(const IThresholdsSubset& thresholdsSubset, const SinglePartition& partition, - IPrediction& head) const { - thresholdsSubset.recalculatePrediction(partition, *this, head); -} - -void CoverageMask::recalculatePrediction(const IThresholdsSubset& thresholdsSubset, BiPartition& partition, - IPrediction& head) const { - 
thresholdsSubset.recalculatePrediction(partition, *this, head); -} diff --git a/cpp/subprojects/common/src/mlrl/common/thresholds/coverage_set.cpp b/cpp/subprojects/common/src/mlrl/common/thresholds/coverage_set.cpp deleted file mode 100644 index e216609395..0000000000 --- a/cpp/subprojects/common/src/mlrl/common/thresholds/coverage_set.cpp +++ /dev/null @@ -1,52 +0,0 @@ -#include "mlrl/common/thresholds/coverage_set.hpp" - -#include "mlrl/common/rule_refinement/prediction.hpp" -#include "mlrl/common/thresholds/thresholds_subset.hpp" - -CoverageSet::CoverageSet(uint32 numElements) - : DenseVectorDecorator>(AllocatedVector(numElements)), numCovered_(numElements) { - setViewToIncreasingValues(this->begin(), numElements, 0, 1); -} - -CoverageSet::CoverageSet(const CoverageSet& other) - : DenseVectorDecorator>(AllocatedVector(other.getNumElements())), - numCovered_(other.numCovered_) { - copyView(other.cbegin(), this->begin(), numCovered_); -} - -uint32 CoverageSet::getNumCovered() const { - return numCovered_; -} - -void CoverageSet::setNumCovered(uint32 numCovered) { - numCovered_ = numCovered; -} - -void CoverageSet::reset() { - numCovered_ = this->getNumElements(); - setViewToIncreasingValues(this->begin(), this->getNumElements(), 0, 1); -} - -std::unique_ptr CoverageSet::copy() const { - return std::make_unique(*this); -} - -Quality CoverageSet::evaluateOutOfSample(const IThresholdsSubset& thresholdsSubset, const SinglePartition& partition, - const IPrediction& head) const { - return thresholdsSubset.evaluateOutOfSample(partition, *this, head); -} - -Quality CoverageSet::evaluateOutOfSample(const IThresholdsSubset& thresholdsSubset, BiPartition& partition, - const IPrediction& head) const { - return thresholdsSubset.evaluateOutOfSample(partition, *this, head); -} - -void CoverageSet::recalculatePrediction(const IThresholdsSubset& thresholdsSubset, const SinglePartition& partition, - IPrediction& head) const { - thresholdsSubset.recalculatePrediction(partition, 
*this, head); -} - -void CoverageSet::recalculatePrediction(const IThresholdsSubset& thresholdsSubset, BiPartition& partition, - IPrediction& head) const { - thresholdsSubset.recalculatePrediction(partition, *this, head); -} diff --git a/cpp/subprojects/common/src/mlrl/common/thresholds/thresholds_approximate.cpp b/cpp/subprojects/common/src/mlrl/common/thresholds/thresholds_approximate.cpp deleted file mode 100644 index 2b34a9cace..0000000000 --- a/cpp/subprojects/common/src/mlrl/common/thresholds/thresholds_approximate.cpp +++ /dev/null @@ -1,421 +0,0 @@ -#include "mlrl/common/thresholds/thresholds_approximate.hpp" - -#include "mlrl/common/rule_refinement/rule_refinement_approximate.hpp" -#include "mlrl/common/util/openmp.hpp" -#include "thresholds_common.hpp" - -#include - -/** - * Updates a given `CoverageSet` after a new condition has been added, such that only the examples that are covered by - * the new rule are marked es covered. - * - * @param thresholdVector A reference to an object of type `ThresholdVector` that stores the thresholds that result - * from the boundaries of the bins - * @param binIndices A reference to an object of type `IBinIndexVector` that stores the indices of the bins, - * individual examples belong to - * @param conditionStart The first bin (inclusive) that is covered by the new condition - * @param conditionEnd The last bin (exclusive) that is covered by the new condition - * @param covered True, if the bins in range [conditionStart, conditionEnd) are covered by the new condition - * and the remaining ones are not, false, if the elements in said range are not covered, but - * the remaining ones are - * @param coverageSet A reference to an object of type `CoverageSet` that is used to keep track of the examples - * that are covered by the previous rule. 
It will be updated by this function - * @param statistics A reference to an object of type `IWeightedStatistics` to be notified about the statistics - * that must be considered when searching for the next refinement, i.e., the statistics that - * are covered by the new rule - */ -static inline void updateCoveredExamples(const ThresholdVector& thresholdVector, const IBinIndexVector& binIndices, - int64 conditionStart, int64 conditionEnd, bool covered, - CoverageSet& coverageSet, IWeightedStatistics& statistics) { - int64 start, end; - - if (conditionEnd < conditionStart) { - start = conditionEnd + 1; - end = conditionStart + 1; - } else { - start = conditionStart; - end = conditionEnd; - } - - uint32 numCovered = coverageSet.getNumCovered(); - CoverageSet::iterator coverageSetIterator = coverageSet.begin(); - statistics.resetCoveredStatistics(); - uint32 n = 0; - - for (uint32 i = 0; i < numCovered; i++) { - uint32 exampleIndex = coverageSetIterator[i]; - - if (!thresholdVector.isMissing(exampleIndex)) { - uint32 binIndex = binIndices.getBinIndex(exampleIndex); - - if (binIndex == IBinIndexVector::BIN_INDEX_SPARSE) { - binIndex = thresholdVector.getSparseBinIndex(); - } - - if ((binIndex >= start && binIndex < end) == covered) { - statistics.addCoveredStatistic(exampleIndex); - coverageSetIterator[n] = exampleIndex; - n++; - } - } - } - - coverageSet.setNumCovered(n); -} - -/** - * Rebuilds a given histogram. 
- * - * @param thresholdVector A reference to an object of type `ThresholdVector` that stores the thresholds that result - * from the boundaries of the bins - * @param histogram A reference to an object of type `IHistogram` that should be rebuild - * @param coverageSet A reference to an object of type `CoverageSet` that is used to keep track of the examples - * that are currently covered - */ -static inline void rebuildHistogram(const ThresholdVector& thresholdVector, IHistogram& histogram, - const CoverageSet& coverageSet) { - // Reset all statistics in the histogram to zero... - histogram.clear(); - - // Iterate the covered examples and add their statistics to the corresponding bin... - uint32 numCovered = coverageSet.getNumCovered(); - CoverageSet::const_iterator coverageSetIterator = coverageSet.cbegin(); - - for (uint32 i = 0; i < numCovered; i++) { - uint32 exampleIndex = coverageSetIterator[i]; - - if (!thresholdVector.isMissing(exampleIndex)) { - histogram.addToBin(exampleIndex); - } - } -} - -/** - * Provides access to the thresholds that result from applying a binning method to the feature values of the training - * examples. - */ -class ApproximateThresholds final : public AbstractThresholds { - private: - - /** - * Provides access to a subset of the thresholds that are stored by an instance of the class - * `ApproximateThresholds`. - * - * @tparam WeightVector The type of the vector that provides access to the weights of individual training - * examples - */ - template - class ThresholdsSubset final : public IThresholdsSubset { - private: - - /** - * A callback that allows to retrieve bins and corresponding statistics. If available, the bins and - * statistics are retrieved from the cache. Otherwise, they are computed by fetching the feature values - * from the feature matrix and applying a binning method. 
- */ - class Callback final : public IRuleRefinementCallback { - private: - - ThresholdsSubset& thresholdsSubset_; - - const uint32 featureIndex_; - - const bool nominal_; - - public: - - /** - * @param thresholdsSubset A reference to an object of type `ThresholdsSubset` that caches the - * bins - * @param featureIndex The index of the feature for which the bins should be retrieved - * @param nominal True, if the feature at index `featureIndex` is nominal, false - * otherwise - */ - Callback(ThresholdsSubset& thresholdsSubset, uint32 featureIndex, bool nominal) - : thresholdsSubset_(thresholdsSubset), featureIndex_(featureIndex), nominal_(nominal) {} - - Result get() override { - auto cacheIterator = thresholdsSubset_.thresholds_.cache_.find(featureIndex_); - IFeatureBinning::Result& cacheEntry = cacheIterator->second; - ThresholdVector* thresholdVector = cacheEntry.thresholdVectorPtr.get(); - IBinIndexVector* binIndices = cacheEntry.binIndicesPtr.get(); - - if (!thresholdVector) { - // Fetch feature vector... - std::unique_ptr featureVectorPtr; - const IColumnWiseFeatureMatrix& featureMatrix = - thresholdsSubset_.thresholds_.featureMatrix_; - uint32 numExamples = featureMatrix.getNumExamples(); - featureMatrix.fetchFeatureVector(featureIndex_, featureVectorPtr); - - // Apply binning method... - const IFeatureBinning& binning = - nominal_ ? *thresholdsSubset_.thresholds_.nominalFeatureBinningPtr_ - : *thresholdsSubset_.thresholds_.numericalFeatureBinningPtr_; - IFeatureBinning::Result result = binning.createBins(*featureVectorPtr, numExamples); - cacheEntry.thresholdVectorPtr = std::move(result.thresholdVectorPtr); - thresholdVector = cacheEntry.thresholdVectorPtr.get(); - cacheEntry.binIndicesPtr = std::move(result.binIndicesPtr); - binIndices = cacheEntry.binIndicesPtr.get(); - } - - auto cacheHistogramIterator = thresholdsSubset_.cacheHistogram_.find(featureIndex_); - - if (!cacheHistogramIterator->second) { - // Create histogram and weight vector... 
- uint32 numBins = thresholdVector->getNumElements(); - cacheHistogramIterator->second = - binIndices->createHistogram(*thresholdsSubset_.weightedStatisticsPtr_, numBins); - } - - // Rebuild histogram... - IHistogram& histogram = *cacheHistogramIterator->second; - rebuildHistogram(*thresholdVector, histogram, thresholdsSubset_.coverageSet_); - - return Result(histogram, *thresholdVector); - } - }; - - ApproximateThresholds& thresholds_; - - std::unique_ptr weightedStatisticsPtr_; - - const WeightVector& weights_; - - CoverageSet coverageSet_; - - std::unordered_map> cacheHistogram_; - - template - std::unique_ptr createApproximateRuleRefinement(const IndexVector& labelIndices, - uint32 featureIndex) { - // Retrieve `unique_ptr` from the cache, or insert an empty one if it does not already exist... - auto cacheHistogramIterator = - cacheHistogram_.emplace(featureIndex, std::unique_ptr()).first; - - // If the `unique_ptr` in the cache does not refer to an `IHistogram`, add an empty - // `IFeatureBinning::Result` to the cache... 
- if (!cacheHistogramIterator->second) { - thresholds_.cache_.emplace(featureIndex, IFeatureBinning::Result()); - } - - std::unique_ptr featureTypePtr = - thresholds_.featureInfo_.createFeatureType(featureIndex); - bool ordinal = featureTypePtr->isOrdinal(); - bool nominal = featureTypePtr->isNominal(); - std::unique_ptr callbackPtr = std::make_unique(*this, featureIndex, nominal); - return std::make_unique>( - labelIndices, coverageSet_.getNumCovered(), featureIndex, ordinal, nominal, - std::move(callbackPtr)); - } - - public: - - /** - * @param thresholds A reference to an object of type `ApproximateThresholds` that stores the - * thresholds - * @param weightedStatisticsPtr An unique pointer to an object of type `IWeightedStatistics` that - * provides access to the statistics - * @param weights A reference to an object of template type `WeightWeight` that provides - * access to the weights of individual training examples - */ - ThresholdsSubset(ApproximateThresholds& thresholds, - std::unique_ptr weightedStatisticsPtr, - const WeightVector& weights) - : thresholds_(thresholds), weightedStatisticsPtr_(std::move(weightedStatisticsPtr)), - weights_(weights), coverageSet_(thresholds.featureMatrix_.getNumExamples()) {} - - /** - * @param thresholdsSubset A reference to an object of type `ThresholdsSubset` to be copied - */ - ThresholdsSubset(const ThresholdsSubset& thresholdsSubset) - : thresholds_(thresholdsSubset.thresholds_), - weightedStatisticsPtr_(thresholdsSubset.weightedStatisticsPtr_->copy()), - weights_(thresholdsSubset.weights_), coverageSet_(thresholdsSubset.coverageSet_) {} - - std::unique_ptr copy() const override { - return std::make_unique>(*this); - } - - std::unique_ptr createRuleRefinement(const CompleteIndexVector& labelIndices, - uint32 featureIndex) override { - return createApproximateRuleRefinement(labelIndices, featureIndex); - } - - std::unique_ptr createRuleRefinement(const PartialIndexVector& labelIndices, - uint32 featureIndex) override { 
- return createApproximateRuleRefinement(labelIndices, featureIndex); - } - - void filterThresholds(const Condition& condition) override { - uint32 featureIndex = condition.featureIndex; - auto cacheIterator = thresholds_.cache_.find(featureIndex); - IFeatureBinning::Result& cacheEntry = cacheIterator->second; - const ThresholdVector& thresholdVector = *cacheEntry.thresholdVectorPtr; - const IBinIndexVector& binIndices = *cacheEntry.binIndicesPtr; - updateCoveredExamples(thresholdVector, binIndices, condition.start, condition.end, - !condition.inverse, coverageSet_, *weightedStatisticsPtr_); - } - - void resetThresholds() override { - coverageSet_.reset(); - } - - const ICoverageState& getCoverageState() const override { - return coverageSet_; - } - - Quality evaluateOutOfSample(const SinglePartition& partition, const CoverageMask& coverageState, - const IPrediction& head) const override { - return evaluateOutOfSampleInternally( - partition.cbegin(), partition.getNumElements(), weights_, coverageState, - thresholds_.statisticsProvider_.get(), head); - } - - Quality evaluateOutOfSample(const BiPartition& partition, const CoverageMask& coverageState, - const IPrediction& head) const override { - return evaluateOutOfSampleInternally( - partition.first_cbegin(), partition.getNumFirst(), weights_, coverageState, - thresholds_.statisticsProvider_.get(), head); - } - - Quality evaluateOutOfSample(const SinglePartition& partition, const CoverageSet& coverageState, - const IPrediction& head) const override { - return evaluateOutOfSampleInternally(weights_, coverageState, thresholds_.statisticsProvider_.get(), - head); - } - - Quality evaluateOutOfSample(BiPartition& partition, const CoverageSet& coverageState, - const IPrediction& head) const override { - return evaluateOutOfSampleInternally(weights_, coverageState, partition, - thresholds_.statisticsProvider_.get(), head); - } - - void recalculatePrediction(const SinglePartition& partition, const CoverageMask& 
coverageState, - IPrediction& head) const override { - recalculatePredictionInternally( - partition.cbegin(), partition.getNumElements(), coverageState, - thresholds_.statisticsProvider_.get(), head); - } - - void recalculatePrediction(const BiPartition& partition, const CoverageMask& coverageState, - IPrediction& head) const override { - recalculatePredictionInternally( - partition.first_cbegin(), partition.getNumFirst(), coverageState, - thresholds_.statisticsProvider_.get(), head); - } - - void recalculatePrediction(const SinglePartition& partition, const CoverageSet& coverageState, - IPrediction& head) const override { - recalculatePredictionInternally(coverageState, thresholds_.statisticsProvider_.get(), head); - } - - void recalculatePrediction(BiPartition& partition, const CoverageSet& coverageState, - IPrediction& head) const override { - recalculatePredictionInternally(coverageState, partition, thresholds_.statisticsProvider_.get(), - head); - } - - void applyPrediction(const IPrediction& prediction) override { - uint32 numCovered = coverageSet_.getNumCovered(); - CoverageSet::const_iterator iterator = coverageSet_.cbegin(); - const IPrediction* predictionPtr = &prediction; - IStatistics* statisticsPtr = &thresholds_.statisticsProvider_.get(); - -#if MULTI_THREADING_SUPPORT_ENABLED - #pragma omp parallel for firstprivate(numCovered) firstprivate(iterator) firstprivate(predictionPtr) \ - firstprivate(statisticsPtr) schedule(dynamic) num_threads(thresholds_.numThreads_) -#endif - for (int64 i = 0; i < numCovered; i++) { - uint32 exampleIndex = iterator[i]; - predictionPtr->apply(*statisticsPtr, exampleIndex); - } - } - - void revertPrediction(const IPrediction& prediction) override { - uint32 numCovered = coverageSet_.getNumCovered(); - CoverageSet::const_iterator iterator = coverageSet_.cbegin(); - const IPrediction* predictionPtr = &prediction; - IStatistics* statisticsPtr = &thresholds_.statisticsProvider_.get(); - -#if MULTI_THREADING_SUPPORT_ENABLED - 
#pragma omp parallel for firstprivate(numCovered) firstprivate(iterator) firstprivate(predictionPtr) \ - firstprivate(statisticsPtr) schedule(dynamic) num_threads(thresholds_.numThreads_) -#endif - for (int64 i = 0; i < numCovered; i++) { - uint32 exampleIndex = iterator[i]; - predictionPtr->revert(*statisticsPtr, exampleIndex); - } - } - }; - - const std::unique_ptr numericalFeatureBinningPtr_; - - const std::unique_ptr nominalFeatureBinningPtr_; - - const uint32 numThreads_; - - std::unordered_map cache_; - - public: - - /** - * @param featureMatrix A reference to an object of type `IColumnWiseFeatureMatrix` that - * provides column-wise access to the feature values of individual training - * examples - * @param featureInfo A reference to an object of type `IFeatureInfo` that provides - * information about the types of individual features - * @param statisticsProvider A reference to an object of type `IStatisticsProvider` that provides - * access to statistics about the labels of the training examples - * @param numericalFeatureBinningPtr An unique pointer to an object of type `IFeatureBinning` that should be - * used to assign numerical feature values to bins - * @param nominalFeatureBinningPtr An unique pointer to an object of type `IFeatureBinning` that should be - * used to assign nominal feature values to bins - * @param numThreads The number of CPU threads to be used to update statistics in parallel - */ - ApproximateThresholds(const IColumnWiseFeatureMatrix& featureMatrix, const IFeatureInfo& featureInfo, - IStatisticsProvider& statisticsProvider, - std::unique_ptr numericalFeatureBinningPtr, - std::unique_ptr nominalFeatureBinningPtr, uint32 numThreads) - : AbstractThresholds(featureMatrix, featureInfo, statisticsProvider), - numericalFeatureBinningPtr_(std::move(numericalFeatureBinningPtr)), - nominalFeatureBinningPtr_(std::move(nominalFeatureBinningPtr)), numThreads_(numThreads) {} - - std::unique_ptr createSubset(const EqualWeightVector& weights) 
override { - IStatistics& statistics = statisticsProvider_.get(); - std::unique_ptr weightedStatisticsPtr = statistics.createWeightedStatistics(weights); - return std::make_unique>( - *this, std::move(weightedStatisticsPtr), weights); - } - - std::unique_ptr createSubset(const BitWeightVector& weights) override { - IStatistics& statistics = statisticsProvider_.get(); - std::unique_ptr weightedStatisticsPtr = statistics.createWeightedStatistics(weights); - return std::make_unique>( - *this, std::move(weightedStatisticsPtr), weights); - } - - std::unique_ptr createSubset(const DenseWeightVector& weights) override { - IStatistics& statistics = statisticsProvider_.get(); - std::unique_ptr weightedStatisticsPtr = statistics.createWeightedStatistics(weights); - return std::make_unique>>( - *this, std::move(weightedStatisticsPtr), weights); - } -}; - -ApproximateThresholdsFactory::ApproximateThresholdsFactory( - std::unique_ptr numericalFeatureBinningFactoryPtr, - std::unique_ptr nominalFeatureBinningFactoryPtr, uint32 numThreads) - : numericalFeatureBinningFactoryPtr_(std::move(numericalFeatureBinningFactoryPtr)), - nominalFeatureBinningFactoryPtr_(std::move(nominalFeatureBinningFactoryPtr)), numThreads_(numThreads) {} - -std::unique_ptr ApproximateThresholdsFactory::create(const IColumnWiseFeatureMatrix& featureMatrix, - const IFeatureInfo& featureInfo, - IStatisticsProvider& statisticsProvider) const { - std::unique_ptr numericalFeatureBinningPtr = numericalFeatureBinningFactoryPtr_->create(); - std::unique_ptr nominalFeatureBinningPtr = nominalFeatureBinningFactoryPtr_->create(); - return std::make_unique(featureMatrix, featureInfo, statisticsProvider, - std::move(numericalFeatureBinningPtr), - std::move(nominalFeatureBinningPtr), numThreads_); -} diff --git a/cpp/subprojects/common/src/mlrl/common/thresholds/thresholds_common.hpp b/cpp/subprojects/common/src/mlrl/common/thresholds/thresholds_common.hpp deleted file mode 100644 index 745c71484b..0000000000 --- 
a/cpp/subprojects/common/src/mlrl/common/thresholds/thresholds_common.hpp +++ /dev/null @@ -1,183 +0,0 @@ -/* - * @author Michael Rapp (michael.rapp.ml@gmail.com) - */ -#pragma once - -#include "mlrl/common/input/feature_info.hpp" -#include "mlrl/common/input/feature_matrix.hpp" -#include "mlrl/common/iterator/binary_forward_iterator.hpp" -#include "mlrl/common/thresholds/thresholds.hpp" - -template -static inline Quality evaluateOutOfSampleInternally(IndexIterator indexIterator, uint32 numExamples, - const WeightVector& weights, const CoverageMask& coverageMask, - const IStatistics& statistics, const IPrediction& prediction) { - OutOfSampleWeightVector outOfSampleWeights(weights); - std::unique_ptr statisticsSubsetPtr = - prediction.createStatisticsSubset(statistics, outOfSampleWeights); - - for (uint32 i = 0; i < numExamples; i++) { - uint32 exampleIndex = indexIterator[i]; - - if (statisticsSubsetPtr->hasNonZeroWeight(exampleIndex) && coverageMask.isCovered(exampleIndex)) { - statisticsSubsetPtr->addToSubset(exampleIndex); - } - } - - return statisticsSubsetPtr->calculateScores(); -} - -template -static inline Quality evaluateOutOfSampleInternally(const WeightVector& weights, const CoverageSet& coverageSet, - const IStatistics& statistics, const IPrediction& prediction) { - OutOfSampleWeightVector outOfSampleWeights(weights); - std::unique_ptr statisticsSubsetPtr = - prediction.createStatisticsSubset(statistics, outOfSampleWeights); - uint32 numCovered = coverageSet.getNumCovered(); - CoverageSet::const_iterator coverageSetIterator = coverageSet.cbegin(); - - for (uint32 i = 0; i < numCovered; i++) { - uint32 exampleIndex = coverageSetIterator[i]; - - if (statisticsSubsetPtr->hasNonZeroWeight(exampleIndex)) { - statisticsSubsetPtr->addToSubset(exampleIndex); - } - } - - return statisticsSubsetPtr->calculateScores(); -} - -template -static inline Quality evaluateOutOfSampleInternally(const WeightVector& weights, const CoverageSet& coverageSet, - BiPartition& 
partition, const IStatistics& statistics, - const IPrediction& prediction) { - OutOfSampleWeightVector outOfSampleWeights(weights); - std::unique_ptr statisticsSubsetPtr = - prediction.createStatisticsSubset(statistics, outOfSampleWeights); - uint32 numCovered = coverageSet.getNumCovered(); - CoverageSet::const_iterator coverageSetIterator = coverageSet.cbegin(); - partition.sortSecond(); - auto holdoutSetIterator = make_binary_forward_iterator(partition.second_cbegin(), partition.second_cend()); - uint32 previousExampleIndex = 0; - - for (uint32 i = 0; i < numCovered; i++) { - uint32 exampleIndex = coverageSetIterator[i]; - std::advance(holdoutSetIterator, exampleIndex - previousExampleIndex); - - if (*holdoutSetIterator && statisticsSubsetPtr->hasNonZeroWeight(exampleIndex)) { - statisticsSubsetPtr->addToSubset(exampleIndex); - } - - previousExampleIndex = exampleIndex; - } - - return statisticsSubsetPtr->calculateScores(); -} - -template -static inline void recalculatePredictionInternally(IndexIterator indexIterator, uint32 numExamples, - const CoverageMask& coverageMask, const IStatistics& statistics, - IPrediction& prediction) { - EqualWeightVector weights(numExamples); - std::unique_ptr statisticsSubsetPtr = prediction.createStatisticsSubset(statistics, weights); - - for (uint32 i = 0; i < numExamples; i++) { - uint32 exampleIndex = indexIterator[i]; - - if (coverageMask.isCovered(exampleIndex)) { - statisticsSubsetPtr->addToSubset(exampleIndex); - } - } - - const IScoreVector& scoreVector = statisticsSubsetPtr->calculateScores(); - scoreVector.updatePrediction(prediction); -} - -static inline void recalculatePredictionInternally(const CoverageSet& coverageSet, const IStatistics& statistics, - IPrediction& prediction) { - uint32 numStatistics = statistics.getNumStatistics(); - EqualWeightVector weights(numStatistics); - std::unique_ptr statisticsSubsetPtr = prediction.createStatisticsSubset(statistics, weights); - uint32 numCovered = 
coverageSet.getNumCovered(); - CoverageSet::const_iterator coverageSetIterator = coverageSet.cbegin(); - - for (uint32 i = 0; i < numCovered; i++) { - uint32 exampleIndex = coverageSetIterator[i]; - statisticsSubsetPtr->addToSubset(exampleIndex); - } - - const IScoreVector& scoreVector = statisticsSubsetPtr->calculateScores(); - scoreVector.updatePrediction(prediction); -} - -static inline void recalculatePredictionInternally(const CoverageSet& coverageSet, BiPartition& partition, - const IStatistics& statistics, IPrediction& prediction) { - uint32 numStatistics = statistics.getNumStatistics(); - EqualWeightVector weights(numStatistics); - std::unique_ptr statisticsSubsetPtr = prediction.createStatisticsSubset(statistics, weights); - uint32 numCovered = coverageSet.getNumCovered(); - CoverageSet::const_iterator coverageSetIterator = coverageSet.cbegin(); - partition.sortSecond(); - auto holdoutSetIterator = make_binary_forward_iterator(partition.second_cbegin(), partition.second_cend()); - uint32 previousExampleIndex = 0; - - for (uint32 i = 0; i < numCovered; i++) { - uint32 exampleIndex = coverageSetIterator[i]; - std::advance(holdoutSetIterator, exampleIndex - previousExampleIndex); - - if (*holdoutSetIterator) { - statisticsSubsetPtr->addToSubset(exampleIndex); - } - - previousExampleIndex = exampleIndex; - } - - const IScoreVector& scoreVector = statisticsSubsetPtr->calculateScores(); - scoreVector.updatePrediction(prediction); -} - -/** - * An abstract base class for all classes that provide access to thresholds that may be used by the first condition of a - * rule that currently has an empty body and therefore covers the entire instance space. - */ -class AbstractThresholds : public IThresholds { - protected: - - /** - * A reference to an object of type `IColumnWiseFeatureMatrix` that provides column-wise access to the feature - * values of the training examples. 
- */ - const IColumnWiseFeatureMatrix& featureMatrix_; - - /** - * A reference to an object of type `IFeatureInfo` that provides information about the types of individual - * features. - */ - const IFeatureInfo& featureInfo_; - - /** - * A reference to an object of type `IStatisticsProvider` that provides access to statistics about the labels of - * the training examples. - */ - IStatisticsProvider& statisticsProvider_; - - public: - - /** - * @param featureMatrix A reference to an object of type `IColumnWiseFeatureMatrix` that provides - * column-wise access to the feature values of individual training examples - * @param featureInfo A reference to an object of type `IFeatureInfo` that provides information about - * the types of individual features - * @param statisticsProvider A reference to an object of type `IStatisticsProvider` that provides access to - * statistics about the labels of the training examples - */ - AbstractThresholds(const IColumnWiseFeatureMatrix& featureMatrix, const IFeatureInfo& featureInfo, - IStatisticsProvider& statisticsProvider) - : featureMatrix_(featureMatrix), featureInfo_(featureInfo), statisticsProvider_(statisticsProvider) {} - - virtual ~AbstractThresholds() override {} - - IStatisticsProvider& getStatisticsProvider() const override final { - return statisticsProvider_; - } -}; diff --git a/cpp/subprojects/common/test/mlrl/common/input/feature_binning_equal_frequency.cpp b/cpp/subprojects/common/test/mlrl/common/input/feature_binning_equal_frequency.cpp new file mode 100644 index 0000000000..79e865bd46 --- /dev/null +++ b/cpp/subprojects/common/test/mlrl/common/input/feature_binning_equal_frequency.cpp @@ -0,0 +1,183 @@ +#include "mlrl/common/input/feature_binning_equal_frequency.hpp" + +#include "mlrl/common/input/feature_binning_equal_frequency.cpp" +#include "mlrl/common/input/feature_vector_equal.hpp" + +#include + +TEST(EqualFrequencyFeatureBinningTest, createBinnedFeatureVectorFromFortranContiguousView) { + // Initialize 
feature matrix... + uint32 numExamples = 7; + AllocatedFortranContiguousView featureView(numExamples, 1); + AllocatedFortranContiguousView::value_iterator features = featureView.values_begin(0); + features[0] = 0.2; + features[1] = -0.1; + features[2] = NAN; + features[3] = -0.2; + features[4] = 0.0; + features[5] = NAN; + features[6] = 0.1; + FortranContiguousView view(features, numExamples, 1); + + // Create feature vector... + std::unique_ptr featureVectorPtr = + EqualFrequencyFeatureBinning(0.5, 1, 0).createFeatureVector(0, view); + + // Check type of feature vector... + const AbstractFeatureVectorDecorator* featureVectorDecorator = + dynamic_cast*>(featureVectorPtr.get()); + EXPECT_TRUE(featureVectorDecorator != nullptr); + + if (featureVectorDecorator) { + // Check for missing feature values... + const MissingFeatureVector& missingFeatureVector = featureVectorDecorator->getView().secondView; + EXPECT_TRUE(missingFeatureVector[2]); + EXPECT_TRUE(missingFeatureVector[5]); + + // Check dimensionality of feature vector... + const BinnedFeatureVector& featureVector = featureVectorDecorator->getView().firstView; + EXPECT_EQ(featureVector.numBins, (uint32) 3); + EXPECT_EQ(featureVector.sparseBinIndex, (uint32) 0); + + // Check thresholds and indices associated with each bin... 
+ BinnedFeatureVector::threshold_const_iterator thresholdIterator = featureVector.thresholds_cbegin(); + BinnedFeatureVector::index_const_iterator indexIterator = featureVector.indices_cbegin(0); + BinnedFeatureVector::index_const_iterator indicesEnd = featureVector.indices_cend(0); + uint32 numIndices = indicesEnd - indexIterator; + EXPECT_EQ(numIndices, (uint32) 2); + EXPECT_EQ(indexIterator[0], (uint32) 3); + EXPECT_EQ(indexIterator[1], (uint32) 1); + EXPECT_FLOAT_EQ(thresholdIterator[0], arithmeticMean(features[1], features[4])); + + indexIterator = featureVector.indices_cbegin(1); + indicesEnd = featureVector.indices_cend(1); + numIndices = indicesEnd - indexIterator; + EXPECT_EQ(numIndices, (uint32) 2); + EXPECT_EQ(indexIterator[0], (uint32) 4); + EXPECT_EQ(indexIterator[1], (uint32) 6); + EXPECT_FLOAT_EQ(thresholdIterator[1], arithmeticMean(features[6], features[0])); + + indexIterator = featureVector.indices_cbegin(2); + indicesEnd = featureVector.indices_cend(2); + numIndices = indicesEnd - indexIterator; + EXPECT_EQ(numIndices, (uint32) 1); + EXPECT_EQ(indexIterator[0], (uint32) 0); + } +} + +TEST(EqualFrequencyFeatureBinningTest, createEqualFeatureVectorFromFortranContiguousView) { + // Initialize feature matrix... + uint32 numExamples = 1; + AllocatedFortranContiguousView featureView(numExamples, 1); + AllocatedFortranContiguousView::value_iterator features = featureView.values_begin(0); + features[0] = 0.0; + FortranContiguousView view(features, numExamples, 1); + + // Create feature vector... + std::unique_ptr featureVectorPtr = + EqualFrequencyFeatureBinning(0.5, 1, 0).createFeatureVector(0, view); + + // Check type of feature vector... + const EqualFeatureVector* featureVector = dynamic_cast(featureVectorPtr.get()); + EXPECT_TRUE(featureVector != nullptr); +} + +TEST(EqualFrequencyFeatureBinningTest, createBinnedFeatureVectorFromCscView) { + // Initialize feature matrix... 
+ uint32 numDense = 7; + float32* data = new float32[numDense]; + uint32* rowIndices = new uint32[numDense]; + data[0] = 0.2; + rowIndices[0] = 0; + data[1] = -0.1; + rowIndices[1] = 2; + data[2] = NAN; + rowIndices[2] = 3; + data[3] = -0.2; + rowIndices[3] = 5; + data[4] = 0.0; + rowIndices[4] = 6; + data[5] = NAN; + rowIndices[5] = 7; + data[6] = 0.1; + rowIndices[6] = 9; + uint32* indptr = new uint32[2]; + indptr[0] = 0; + indptr[1] = numDense; + CscView view(data, rowIndices, indptr, numDense + 3, 1); + + // Create feature vector... + std::unique_ptr featureVectorPtr = + EqualFrequencyFeatureBinning(0.3, 1, 0).createFeatureVector(0, view); + + // Check type of feature vector... + const AbstractFeatureVectorDecorator* featureVectorDecorator = + dynamic_cast*>(featureVectorPtr.get()); + EXPECT_TRUE(featureVectorDecorator != nullptr); + + if (featureVectorDecorator) { + // Check for missing feature values... + const MissingFeatureVector& missingFeatureVector = featureVectorDecorator->getView().secondView; + EXPECT_TRUE(missingFeatureVector[3]); + EXPECT_TRUE(missingFeatureVector[7]); + + // Check dimensionality of feature vector... + const BinnedFeatureVector& featureVector = featureVectorDecorator->getView().firstView; + EXPECT_EQ(featureVector.numBins, (uint32) 3); + EXPECT_EQ(featureVector.sparseBinIndex, (uint32) 1); + + // Check thresholds and indices associated with each bin... 
+ BinnedFeatureVector::threshold_const_iterator thresholdIterator = featureVector.thresholds_cbegin(); + BinnedFeatureVector::index_const_iterator indexIterator = featureVector.indices_cbegin(0); + BinnedFeatureVector::index_const_iterator indicesEnd = featureVector.indices_cend(0); + uint32 numIndices = indicesEnd - indexIterator; + EXPECT_EQ(numIndices, (uint32) 2); + EXPECT_EQ(indexIterator[0], (uint32) 5); + EXPECT_EQ(indexIterator[1], (uint32) 2); + EXPECT_FLOAT_EQ(thresholdIterator[0], arithmeticMean(data[1], 0.0f)); + + indexIterator = featureVector.indices_cbegin(1); + indicesEnd = featureVector.indices_cend(1); + numIndices = indicesEnd - indexIterator; + EXPECT_EQ(numIndices, (uint32) 0); + EXPECT_FLOAT_EQ(thresholdIterator[1], arithmeticMean(0.0f, data[6])); + + indexIterator = featureVector.indices_cbegin(2); + indicesEnd = featureVector.indices_cend(2); + numIndices = indicesEnd - indexIterator; + EXPECT_EQ(numIndices, (uint32) 2); + EXPECT_EQ(indexIterator[0], (uint32) 9); + EXPECT_EQ(indexIterator[1], (uint32) 0); + } + + delete[] data; + delete[] rowIndices; + delete[] indptr; +} + +TEST(EqualFrequencyFeatureBinningTest, createEqualFeatureVectorFromCscView) { + // Initialize feature matrix... + uint32 numDense = 2; + float32* data = new float32[numDense]; + uint32* rowIndices = new uint32[numDense]; + data[0] = 0.1; + rowIndices[0] = 0; + data[1] = 0.1; + rowIndices[1] = 1; + uint32* indptr = new uint32[2]; + indptr[0] = 0; + indptr[1] = numDense; + CscView view(data, rowIndices, indptr, numDense, 1); + + // Create feature vector... + std::unique_ptr featureVectorPtr = + EqualFrequencyFeatureBinning(0.3, 1, 0).createFeatureVector(0, view); + + // Check type of feature vector... 
+ const EqualFeatureVector* featureVector = dynamic_cast(featureVectorPtr.get()); + EXPECT_TRUE(featureVector != nullptr); + + delete[] data; + delete[] rowIndices; + delete[] indptr; +} diff --git a/cpp/subprojects/common/test/mlrl/common/input/feature_binning_equal_width.cpp b/cpp/subprojects/common/test/mlrl/common/input/feature_binning_equal_width.cpp new file mode 100644 index 0000000000..39ecbe5f48 --- /dev/null +++ b/cpp/subprojects/common/test/mlrl/common/input/feature_binning_equal_width.cpp @@ -0,0 +1,178 @@ +#include "mlrl/common/input/feature_binning_equal_width.hpp" + +#include "mlrl/common/input/feature_binning_equal_width.cpp" +#include "mlrl/common/input/feature_vector_equal.hpp" + +#include + +TEST(EqualWidthFeatureBinningTest, createBinnedFeatureVectorFromFortranContiguousView) { + // Initialize feature matrix... + uint32 numExamples = 7; + AllocatedFortranContiguousView featureView(numExamples, 1); + AllocatedFortranContiguousView::value_iterator features = featureView.values_begin(0); + features[0] = 0.2; + features[1] = -0.1; + features[2] = NAN; + features[3] = -0.2; + features[4] = 0.0; + features[5] = NAN; + features[6] = 0.1; + FortranContiguousView view(features, numExamples, 1); + + // Create feature vector... + std::unique_ptr featureVectorPtr = EqualWidthFeatureBinning(0.4, 1, 0).createFeatureVector(0, view); + + // Check type of feature vector... + const AbstractFeatureVectorDecorator* featureVectorDecorator = + dynamic_cast*>(featureVectorPtr.get()); + EXPECT_TRUE(featureVectorDecorator != nullptr); + + if (featureVectorDecorator) { + // Check for missing feature values... + const MissingFeatureVector& missingFeatureVector = featureVectorDecorator->getView().secondView; + EXPECT_TRUE(missingFeatureVector[2]); + EXPECT_TRUE(missingFeatureVector[5]); + + // Check dimensionality of feature vector... 
+ const BinnedFeatureVector& featureVector = featureVectorDecorator->getView().firstView; + EXPECT_EQ(featureVector.numBins, (uint32) 3); + EXPECT_EQ(featureVector.sparseBinIndex, (uint32) 1); + + // Check thresholds and indices associated with each bin... + BinnedFeatureVector::threshold_const_iterator thresholdIterator = featureVector.thresholds_cbegin(); + BinnedFeatureVector::index_const_iterator indexIterator = featureVector.indices_cbegin(0); + BinnedFeatureVector::index_const_iterator indicesEnd = featureVector.indices_cend(0); + uint32 numIndices = indicesEnd - indexIterator; + EXPECT_EQ(numIndices, (uint32) 2); + EXPECT_EQ(indexIterator[0], (uint32) 1); + EXPECT_EQ(indexIterator[1], (uint32) 3); + EXPECT_FLOAT_EQ(thresholdIterator[0], -0.066666663f); + + indexIterator = featureVector.indices_cbegin(1); + indicesEnd = featureVector.indices_cend(1); + numIndices = indicesEnd - indexIterator; + EXPECT_EQ(numIndices, (uint32) 0); + EXPECT_FLOAT_EQ(thresholdIterator[1], 0.066666678f); + + indexIterator = featureVector.indices_cbegin(2); + indicesEnd = featureVector.indices_cend(2); + numIndices = indicesEnd - indexIterator; + EXPECT_EQ(numIndices, (uint32) 2); + EXPECT_EQ(indexIterator[0], (uint32) 0); + EXPECT_EQ(indexIterator[1], (uint32) 6); + } +} + +TEST(EqualWidthFeatureBinningTest, createEqualFeatureVectorFromFortranContiguousView) { + // Initialize feature matrix... + uint32 numExamples = 1; + AllocatedFortranContiguousView featureView(numExamples, 1); + AllocatedFortranContiguousView::value_iterator features = featureView.values_begin(0); + features[0] = 0.0; + FortranContiguousView view(features, numExamples, 1); + + // Create feature vector... + std::unique_ptr featureVectorPtr = EqualWidthFeatureBinning(0.4, 1, 0).createFeatureVector(0, view); + + // Check type of feature vector... 
+ const EqualFeatureVector* featureVector = dynamic_cast(featureVectorPtr.get()); + EXPECT_TRUE(featureVector != nullptr); +} + +TEST(EqualWidthFeatureBinningTest, createBinnedFeatureVectorFromCscView) { + // Initialize feature matrix... + uint32 numDense = 7; + float32* data = new float32[numDense]; + uint32* rowIndices = new uint32[numDense]; + data[0] = 0.2; + rowIndices[0] = 0; + data[1] = -0.1; + rowIndices[1] = 2; + data[2] = NAN; + rowIndices[2] = 3; + data[3] = -0.2; + rowIndices[3] = 5; + data[4] = 0.0; + rowIndices[4] = 6; + data[5] = NAN; + rowIndices[5] = 7; + data[6] = 0.1; + rowIndices[6] = 9; + uint32* indptr = new uint32[2]; + indptr[0] = 0; + indptr[1] = numDense; + CscView view(data, rowIndices, indptr, numDense + 3, 1); + + // Create feature vector... + std::unique_ptr featureVectorPtr = EqualWidthFeatureBinning(0.3, 1, 0).createFeatureVector(0, view); + + // Check type of feature vector... + const AbstractFeatureVectorDecorator* featureVectorDecorator = + dynamic_cast*>(featureVectorPtr.get()); + EXPECT_TRUE(featureVectorDecorator != nullptr); + + if (featureVectorDecorator) { + // Check for missing feature values... + const MissingFeatureVector& missingFeatureVector = featureVectorDecorator->getView().secondView; + EXPECT_TRUE(missingFeatureVector[3]); + EXPECT_TRUE(missingFeatureVector[7]); + + // Check dimensionality of feature vector... + const BinnedFeatureVector& featureVector = featureVectorDecorator->getView().firstView; + EXPECT_EQ(featureVector.numBins, (uint32) 3); + EXPECT_EQ(featureVector.sparseBinIndex, (uint32) 1); + + // Check thresholds and indices associated with each bin... 
+ BinnedFeatureVector::threshold_const_iterator thresholdIterator = featureVector.thresholds_cbegin(); + BinnedFeatureVector::index_const_iterator indexIterator = featureVector.indices_cbegin(0); + BinnedFeatureVector::index_const_iterator indicesEnd = featureVector.indices_cend(0); + uint32 numIndices = indicesEnd - indexIterator; + EXPECT_EQ(numIndices, (uint32) 2); + EXPECT_EQ(indexIterator[0], (uint32) 2); + EXPECT_EQ(indexIterator[1], (uint32) 5); + EXPECT_FLOAT_EQ(thresholdIterator[0], -0.066666663f); + + indexIterator = featureVector.indices_cbegin(1); + indicesEnd = featureVector.indices_cend(1); + numIndices = indicesEnd - indexIterator; + EXPECT_EQ(numIndices, (uint32) 0); + EXPECT_FLOAT_EQ(thresholdIterator[1], 0.066666678f); + + indexIterator = featureVector.indices_cbegin(2); + indicesEnd = featureVector.indices_cend(2); + numIndices = indicesEnd - indexIterator; + EXPECT_EQ(numIndices, (uint32) 2); + EXPECT_EQ(indexIterator[0], (uint32) 0); + EXPECT_EQ(indexIterator[1], (uint32) 9); + } + + delete[] data; + delete[] rowIndices; + delete[] indptr; +} + +TEST(EqualWidthFeatureBinningTest, createEqualFeatureVectorFromCscView) { + // Initialize feature matrix... + uint32 numDense = 2; + float32* data = new float32[numDense]; + uint32* rowIndices = new uint32[numDense]; + data[0] = 0.1; + rowIndices[0] = 0; + data[1] = 0.1; + rowIndices[1] = 1; + uint32* indptr = new uint32[2]; + indptr[0] = 0; + indptr[1] = numDense; + CscView view(data, rowIndices, indptr, numDense, 1); + + // Create feature vector... + std::unique_ptr featureVectorPtr = EqualWidthFeatureBinning(0.4, 1, 0).createFeatureVector(0, view); + + // Check type of feature vector... 
+ const EqualFeatureVector* featureVector = dynamic_cast(featureVectorPtr.get()); + EXPECT_TRUE(featureVector != nullptr); + + delete[] data; + delete[] rowIndices; + delete[] indptr; +} diff --git a/cpp/subprojects/common/test/mlrl/common/input/feature_vector_decorator_binary.cpp b/cpp/subprojects/common/test/mlrl/common/input/feature_vector_decorator_binary.cpp index 160cecf219..04c7444e72 100644 --- a/cpp/subprojects/common/test/mlrl/common/input/feature_vector_decorator_binary.cpp +++ b/cpp/subprojects/common/test/mlrl/common/input/feature_vector_decorator_binary.cpp @@ -26,17 +26,17 @@ TEST(BinaryFeatureVectorDecoratorTest, updateCoverageMaskAndStatistics) { CoverageMask coverageMask(numExamples); uint32 indicatorValue = 1; decorator.updateCoverageMaskAndStatistics(interval, coverageMask, indicatorValue, statistics); - EXPECT_EQ(coverageMask.getIndicatorValue(), indicatorValue); + EXPECT_EQ(coverageMask.indicatorValue, indicatorValue); const BinaryFeatureVector& binaryFeatureVector = decorator.getView().firstView; for (auto it = binaryFeatureVector.indices_cbegin(0); it != binaryFeatureVector.indices_cend(0); it++) { uint32 index = *it; - EXPECT_TRUE(coverageMask.isCovered(index)); + EXPECT_TRUE(coverageMask[index]); EXPECT_TRUE(statistics.coveredStatistics.find(index) != statistics.coveredStatistics.end()); } for (uint32 i = numMinorityExamples; i < numExamples; i++) { - EXPECT_FALSE(coverageMask.isCovered(i)); + EXPECT_FALSE(coverageMask[i]); EXPECT_FALSE(statistics.coveredStatistics.find(i) != statistics.coveredStatistics.end()); } } @@ -69,22 +69,22 @@ TEST(BinaryFeatureVectorDecoratorTest, updateCoverageMaskAndStatisticsInverse) { CoverageMask coverageMask(numExamples); uint32 indicatorValue = 1; decorator.updateCoverageMaskAndStatistics(interval, coverageMask, indicatorValue, statistics); - EXPECT_EQ(coverageMask.getIndicatorValue(), (uint32) 0); + EXPECT_EQ(coverageMask.indicatorValue, (uint32) 0); const BinaryFeatureVector& binaryFeatureVector = 
decorator.getView().firstView; for (auto it = binaryFeatureVector.indices_cbegin(0); it != binaryFeatureVector.indices_cend(0); it++) { uint32 index = *it; - EXPECT_FALSE(coverageMask.isCovered(index)); + EXPECT_FALSE(coverageMask[index]); EXPECT_FALSE(statistics.coveredStatistics.find(index) != statistics.coveredStatistics.end()); } for (uint32 i = numMinorityExamples; i < numMinorityExamples + numMissingExamples; i++) { - EXPECT_FALSE(coverageMask.isCovered(i)); + EXPECT_FALSE(coverageMask[i]); EXPECT_FALSE(statistics.coveredStatistics.find(i) != statistics.coveredStatistics.end()); } for (uint32 i = numMinorityExamples + numMissingExamples; i < numExamples; i++) { - EXPECT_TRUE(coverageMask.isCovered(i)); + EXPECT_TRUE(coverageMask[i]); EXPECT_TRUE(statistics.coveredStatistics.find(i) != statistics.coveredStatistics.end()); } } @@ -116,7 +116,7 @@ TEST(BinaryFeatureVectorDecoratorTest, createFilteredFeatureVectorFromCoverageMa CoverageMask coverageMask(numExamples); uint32 indicatorValue = 1; - coverageMask.setIndicatorValue(indicatorValue); + coverageMask.indicatorValue = indicatorValue; CoverageMask::iterator coverageMaskIterator = coverageMask.begin(); for (uint32 i = 0; i < numExamples; i++) { @@ -185,7 +185,7 @@ TEST(BinaryFeatureVectorDecoratorTest, createFilteredFeatureVectorFromCoverageMa CoverageMask coverageMask(numExamples); uint32 indicatorValue = 1; - coverageMask.setIndicatorValue(indicatorValue); + coverageMask.indicatorValue = indicatorValue; CoverageMask::iterator coverageMaskIterator = coverageMask.begin(); for (uint32 i = 0; i < numExamples; i++) { @@ -246,7 +246,7 @@ TEST(BinaryFeatureVectorDecoratorTest, createFilteredFeatureVectorFromCoverageMa } CoverageMask coverageMask(numMinorityExamples); - coverageMask.setIndicatorValue(1); + coverageMask.indicatorValue = 1; BinaryFeatureVectorDecorator decorator(std::move(featureVector), AllocatedMissingFeatureVector()); std::unique_ptr existing; diff --git 
a/cpp/subprojects/common/test/mlrl/common/input/feature_vector_decorator_binned.cpp b/cpp/subprojects/common/test/mlrl/common/input/feature_vector_decorator_binned.cpp new file mode 100644 index 0000000000..dd61d9eb6c --- /dev/null +++ b/cpp/subprojects/common/test/mlrl/common/input/feature_vector_decorator_binned.cpp @@ -0,0 +1,763 @@ +#include "mlrl/common/input/feature_vector_decorator_binned.hpp" + +#include "mlrl/common/input/feature_vector_binned.hpp" +#include "statistics_weighted.hpp" + +#include + +TEST(BinnedFeatureVectorDecoratorTest, updateCoverageMaskAndStatistics) { + uint32 numBins = 4; + uint32 numExamplesPerBin = 10; + uint32 numMinorityExamples = numBins * numExamplesPerBin; + AllocatedBinnedFeatureVector featureVector(numBins, numMinorityExamples, 0); + AllocatedBinnedFeatureVector::threshold_iterator thresholdIterator = featureVector.thresholds; + AllocatedBinnedFeatureVector::index_iterator indptrIterator = featureVector.indptr; + AllocatedBinnedFeatureVector::index_iterator indexIterator = featureVector.indices_begin(0); + + for (uint32 i = 0; i < numBins; i++) { + if (i < numBins - 1) { + thresholdIterator[i] = i; + } + + indptrIterator[i] = i * numExamplesPerBin; + + for (uint32 j = 0; j < numExamplesPerBin; j++) { + uint32 index = (i * numExamplesPerBin) + j; + indexIterator[index] = index; + } + } + + WeightedStatistics statistics; + uint32 numExamples = numMinorityExamples + 15; + + for (uint32 i = 0; i < numExamples; i++) { + statistics.addCoveredStatistic(i); + } + + BinnedFeatureVectorDecorator decorator(std::move(featureVector), AllocatedMissingFeatureVector()); + Interval interval(2, numBins); + CoverageMask coverageMask(numExamples); + uint32 indicatorValue = 1; + decorator.updateCoverageMaskAndStatistics(interval, coverageMask, indicatorValue, statistics); + EXPECT_EQ(coverageMask.indicatorValue, indicatorValue); + const BinnedFeatureVector& binnedFeatureVector = decorator.getView().firstView; + + for (uint32 i = 0; i < 
interval.start; i++) { + for (auto it = binnedFeatureVector.indices_cbegin(i); it != binnedFeatureVector.indices_cend(i); it++) { + uint32 index = *it; + EXPECT_FALSE(coverageMask[index]); + EXPECT_FALSE(statistics.coveredStatistics.find(index) != statistics.coveredStatistics.end()); + } + } + + for (uint32 i = interval.start; i < interval.end; i++) { + for (auto it = binnedFeatureVector.indices_cbegin(i); it != binnedFeatureVector.indices_cend(i); it++) { + uint32 index = *it; + EXPECT_TRUE(coverageMask[index]); + EXPECT_TRUE(statistics.coveredStatistics.find(index) != statistics.coveredStatistics.end()); + } + } + + for (uint32 i = interval.end; i < numBins; i++) { + for (auto it = binnedFeatureVector.indices_cbegin(i); it != binnedFeatureVector.indices_cend(i); it++) { + uint32 index = *it; + EXPECT_FALSE(coverageMask[index]); + EXPECT_FALSE(statistics.coveredStatistics.find(index) != statistics.coveredStatistics.end()); + } + } + + for (uint32 i = numMinorityExamples; i < numExamples; i++) { + EXPECT_FALSE(coverageMask[i]); + EXPECT_FALSE(statistics.coveredStatistics.find(i) != statistics.coveredStatistics.end()); + } +} + +TEST(BinnedFeatureVectorDecoratorTest, updateCoverageMaskAndStatisticsInverse) { + uint32 numBins = 4; + uint32 numExamplesPerBin = 10; + uint32 numMinorityExamples = numBins * numExamplesPerBin; + AllocatedBinnedFeatureVector featureVector(numBins, numMinorityExamples, 0); + AllocatedBinnedFeatureVector::threshold_iterator thresholdIterator = featureVector.thresholds; + AllocatedBinnedFeatureVector::index_iterator indptrIterator = featureVector.indptr; + AllocatedBinnedFeatureVector::index_iterator indexIterator = featureVector.indices_begin(0); + + for (uint32 i = 0; i < numBins; i++) { + if (i < numBins - 1) { + thresholdIterator[i] = i; + } + + indptrIterator[i] = i * numExamplesPerBin; + + for (uint32 j = 0; j < numExamplesPerBin; j++) { + uint32 index = (i * numExamplesPerBin) + j; + indexIterator[index] = index; + } + } + + 
AllocatedMissingFeatureVector missingFeatureVector; + uint32 numMissingExamples = 5; + + for (uint32 i = numMinorityExamples; i < numMinorityExamples + numMissingExamples; i++) { + missingFeatureVector.set(i, true); + } + + WeightedStatistics statistics; + uint32 numExamples = numMinorityExamples + numMissingExamples + 15; + + for (uint32 i = 0; i < numExamples; i++) { + statistics.addCoveredStatistic(i); + } + + BinnedFeatureVectorDecorator decorator(std::move(featureVector), std::move(missingFeatureVector)); + Interval interval(2, numBins, true); + CoverageMask coverageMask(numExamples); + uint32 indicatorValue = 1; + decorator.updateCoverageMaskAndStatistics(interval, coverageMask, indicatorValue, statistics); + EXPECT_EQ(coverageMask.indicatorValue, (uint32) 0); + const BinnedFeatureVector& binnedFeatureVector = decorator.getView().firstView; + + for (uint32 i = 0; i < interval.start; i++) { + for (auto it = binnedFeatureVector.indices_cbegin(i); it != binnedFeatureVector.indices_cend(i); it++) { + uint32 index = *it; + EXPECT_TRUE(coverageMask[index]); + EXPECT_TRUE(statistics.coveredStatistics.find(index) != statistics.coveredStatistics.end()); + } + } + + for (uint32 i = interval.start; i < interval.end; i++) { + for (auto it = binnedFeatureVector.indices_cbegin(i); it != binnedFeatureVector.indices_cend(i); it++) { + uint32 index = *it; + EXPECT_FALSE(coverageMask[index]); + EXPECT_FALSE(statistics.coveredStatistics.find(index) != statistics.coveredStatistics.end()); + } + } + + for (uint32 i = interval.end; i < numBins; i++) { + for (auto it = binnedFeatureVector.indices_cbegin(i); it != binnedFeatureVector.indices_cend(i); it++) { + uint32 index = *it; + EXPECT_TRUE(coverageMask[index]); + EXPECT_TRUE(statistics.coveredStatistics.find(index) != statistics.coveredStatistics.end()); + } + } + + for (uint32 i = numMinorityExamples; i < numMinorityExamples + numMissingExamples; i++) { + EXPECT_FALSE(coverageMask[i]); + 
EXPECT_FALSE(statistics.coveredStatistics.find(i) != statistics.coveredStatistics.end()); + } + + for (uint32 i = numMinorityExamples + numMissingExamples; i < numExamples; i++) { + EXPECT_TRUE(coverageMask[i]); + EXPECT_TRUE(statistics.coveredStatistics.find(i) != statistics.coveredStatistics.end()); + } +} + +TEST(BinnedFeatureVectorDecoratorTest, updateCoverageMaskAndStatisticsFromView) { + uint32 numBins = 4; + uint32 numExamplesPerBin = 10; + uint32 numMinorityExamples = numBins * numExamplesPerBin; + AllocatedBinnedFeatureVector featureVector(numBins, numMinorityExamples, 0); + AllocatedBinnedFeatureVector::threshold_iterator thresholdIterator = featureVector.thresholds; + AllocatedBinnedFeatureVector::index_iterator indptrIterator = featureVector.indptr; + AllocatedBinnedFeatureVector::index_iterator indexIterator = featureVector.indices_begin(0); + + for (uint32 i = 0; i < numBins; i++) { + if (i < numBins - 1) { + thresholdIterator[i] = i; + } + + indptrIterator[i] = i * numExamplesPerBin; + + for (uint32 j = 0; j < numExamplesPerBin; j++) { + uint32 index = (i * numExamplesPerBin) + j; + indexIterator[index] = index; + } + } + + WeightedStatistics statistics; + uint32 numExamples = numMinorityExamples + 15; + + for (uint32 i = 0; i < numExamples; i++) { + statistics.addCoveredStatistic(i); + } + + BinnedFeatureVectorDecorator decorator(std::move(featureVector), AllocatedMissingFeatureVector()); + Interval interval(2, numBins); + CoverageMask coverageMask(numExamples); + uint32 indicatorValue = 1; + std::unique_ptr existing; + decorator.createFilteredFeatureVector(existing, Interval(0, numBins)) + ->updateCoverageMaskAndStatistics(interval, coverageMask, indicatorValue, statistics); + EXPECT_EQ(coverageMask.indicatorValue, indicatorValue); + const BinnedFeatureVector& binnedFeatureVector = decorator.getView().firstView; + + for (uint32 i = 0; i < interval.start; i++) { + for (auto it = binnedFeatureVector.indices_cbegin(i); it != 
binnedFeatureVector.indices_cend(i); it++) { + uint32 index = *it; + EXPECT_FALSE(coverageMask[index]); + EXPECT_FALSE(statistics.coveredStatistics.find(index) != statistics.coveredStatistics.end()); + } + } + + for (uint32 i = interval.start; i < interval.end; i++) { + for (auto it = binnedFeatureVector.indices_cbegin(i); it != binnedFeatureVector.indices_cend(i); it++) { + uint32 index = *it; + EXPECT_TRUE(coverageMask[index]); + EXPECT_TRUE(statistics.coveredStatistics.find(index) != statistics.coveredStatistics.end()); + } + } + + for (uint32 i = interval.end; i < numBins; i++) { + for (auto it = binnedFeatureVector.indices_cbegin(i); it != binnedFeatureVector.indices_cend(i); it++) { + uint32 index = *it; + EXPECT_FALSE(coverageMask[index]); + EXPECT_FALSE(statistics.coveredStatistics.find(index) != statistics.coveredStatistics.end()); + } + } + + for (uint32 i = numMinorityExamples; i < numExamples; i++) { + EXPECT_FALSE(coverageMask[i]); + EXPECT_FALSE(statistics.coveredStatistics.find(i) != statistics.coveredStatistics.end()); + } +} + +TEST(BinnedFeatureVectorDecoratorTest, updateCoverageMaskAndStatisticsFromViewInverse) { + uint32 numBins = 4; + uint32 numExamplesPerBin = 10; + uint32 numMinorityExamples = numBins * numExamplesPerBin; + AllocatedBinnedFeatureVector featureVector(numBins, numMinorityExamples, 0); + AllocatedBinnedFeatureVector::threshold_iterator thresholdIterator = featureVector.thresholds; + AllocatedBinnedFeatureVector::index_iterator indptrIterator = featureVector.indptr; + AllocatedBinnedFeatureVector::index_iterator indexIterator = featureVector.indices_begin(0); + + for (uint32 i = 0; i < numBins; i++) { + if (i < numBins - 1) { + thresholdIterator[i] = i; + } + + indptrIterator[i] = i * numExamplesPerBin; + + for (uint32 j = 0; j < numExamplesPerBin; j++) { + uint32 index = (i * numExamplesPerBin) + j; + indexIterator[index] = index; + } + } + + WeightedStatistics statistics; + uint32 numExamples = numMinorityExamples + 15; + + for 
(uint32 i = 0; i < numExamples; i++) { + statistics.addCoveredStatistic(i); + } + + BinnedFeatureVectorDecorator decorator(std::move(featureVector), AllocatedMissingFeatureVector()); + Interval interval(2, numBins, true); + CoverageMask coverageMask(numExamples); + uint32 indicatorValue = 1; + std::unique_ptr existing; + decorator.createFilteredFeatureVector(existing, Interval(0, numBins)) + ->updateCoverageMaskAndStatistics(interval, coverageMask, indicatorValue, statistics); + EXPECT_EQ(coverageMask.indicatorValue, (uint32) 0); + const BinnedFeatureVector& binnedFeatureVector = decorator.getView().firstView; + + for (uint32 i = 0; i < interval.start; i++) { + for (auto it = binnedFeatureVector.indices_cbegin(i); it != binnedFeatureVector.indices_cend(i); it++) { + uint32 index = *it; + EXPECT_TRUE(coverageMask[index]); + EXPECT_TRUE(statistics.coveredStatistics.find(index) != statistics.coveredStatistics.end()); + } + } + + for (uint32 i = interval.start; i < interval.end; i++) { + for (auto it = binnedFeatureVector.indices_cbegin(i); it != binnedFeatureVector.indices_cend(i); it++) { + uint32 index = *it; + EXPECT_FALSE(coverageMask[index]); + EXPECT_FALSE(statistics.coveredStatistics.find(index) != statistics.coveredStatistics.end()); + } + } + + for (uint32 i = interval.end; i < numBins; i++) { + for (auto it = binnedFeatureVector.indices_cbegin(i); it != binnedFeatureVector.indices_cend(i); it++) { + uint32 index = *it; + EXPECT_TRUE(coverageMask[index]); + EXPECT_TRUE(statistics.coveredStatistics.find(index) != statistics.coveredStatistics.end()); + } + } + + for (uint32 i = numMinorityExamples; i < numExamples; i++) { + EXPECT_TRUE(coverageMask[i]); + EXPECT_TRUE(statistics.coveredStatistics.find(i) != statistics.coveredStatistics.end()); + } +} + +TEST(BinnedFeatureVectorDecoratorTest, createFilteredFeatureVectorFromIndices) { + uint32 numBins = 4; + uint32 numExamplesPerBin = 10; + uint32 numMinorityExamples = numBins * numExamplesPerBin; + 
AllocatedBinnedFeatureVector featureVector(numBins, numMinorityExamples, 0); + AllocatedBinnedFeatureVector::threshold_iterator thresholdIterator = featureVector.thresholds; + AllocatedBinnedFeatureVector::index_iterator indptrIterator = featureVector.indptr; + AllocatedBinnedFeatureVector::index_iterator indexIterator = featureVector.indices_begin(0); + + for (uint32 i = 0; i < numBins; i++) { + if (i < numBins - 1) { + thresholdIterator[i] = i; + } + + indptrIterator[i] = i * numExamplesPerBin; + + for (uint32 j = 0; j < numExamplesPerBin; j++) { + uint32 index = (i * numExamplesPerBin) + j; + indexIterator[index] = index; + } + } + + BinnedFeatureVectorDecorator decorator(std::move(featureVector), AllocatedMissingFeatureVector()); + std::unique_ptr existing; + Interval interval(2, numBins); + std::unique_ptr filtered = decorator.createFilteredFeatureVector(existing, interval); + const BinnedFeatureVectorView* filteredDecorator = dynamic_cast(filtered.get()); + EXPECT_TRUE(filteredDecorator != nullptr); + + if (filteredDecorator) { + // Check filtered indices... 
+ const BinnedFeatureVector& filteredFeatureVector = filteredDecorator->getView().firstView; + EXPECT_EQ(filteredFeatureVector.numBins, interval.end - interval.start); + BinnedFeatureVector::threshold_const_iterator thresholdsBegin = filteredFeatureVector.thresholds_cbegin(); + + for (uint32 i = 0; i < filteredFeatureVector.numBins; i++) { + if (i < filteredFeatureVector.numBins - 1) { + EXPECT_EQ(thresholdsBegin[i], (int32) (interval.start + i)); + } + + BinnedFeatureVector::index_const_iterator indicesBegin = filteredFeatureVector.indices_cbegin(i); + BinnedFeatureVector::index_const_iterator indicesEnd = filteredFeatureVector.indices_cend(i); + uint32 numIndices = indicesEnd - indicesBegin; + EXPECT_EQ(numIndices, numExamplesPerBin); + + for (uint32 j = 0; j < numIndices; j++) { + EXPECT_EQ(indicesBegin[j], ((i + interval.start) * numExamplesPerBin) + j); + } + } + } +} + +TEST(BinnedFeatureVectorDecoratorTest, createFilteredFeatureVectorFromViewWithIndices) { + uint32 numBins = 4; + uint32 numExamplesPerBin = 10; + uint32 numMinorityExamples = numBins * numExamplesPerBin; + AllocatedBinnedFeatureVector featureVector(numBins, numMinorityExamples, 0); + AllocatedBinnedFeatureVector::threshold_iterator thresholdIterator = featureVector.thresholds; + AllocatedBinnedFeatureVector::index_iterator indptrIterator = featureVector.indptr; + AllocatedBinnedFeatureVector::index_iterator indexIterator = featureVector.indices_begin(0); + + for (uint32 i = 0; i < numBins; i++) { + if (i < numBins - 1) { + thresholdIterator[i] = i; + } + + indptrIterator[i] = i * numExamplesPerBin; + + for (uint32 j = 0; j < numExamplesPerBin; j++) { + uint32 index = (i * numExamplesPerBin) + j; + indexIterator[index] = index; + } + } + + BinnedFeatureVectorDecorator decorator(std::move(featureVector), AllocatedMissingFeatureVector()); + std::unique_ptr existing; + Interval interval(2, numBins); + std::unique_ptr filtered = decorator.createFilteredFeatureVector(existing, Interval(0, numBins)) 
+ ->createFilteredFeatureVector(existing, interval); + const BinnedFeatureVectorView* filteredDecorator = dynamic_cast(filtered.get()); + EXPECT_TRUE(filteredDecorator != nullptr); + + if (filteredDecorator) { + // Check filtered indices... + const BinnedFeatureVector& filteredFeatureVector = filteredDecorator->getView().firstView; + EXPECT_EQ(filteredFeatureVector.numBins, interval.end - interval.start); + BinnedFeatureVector::threshold_const_iterator thresholdsBegin = filteredFeatureVector.thresholds_cbegin(); + + for (uint32 i = 0; i < filteredFeatureVector.numBins; i++) { + if (i < filteredFeatureVector.numBins - 1) { + EXPECT_EQ(thresholdsBegin[i], (int32) (interval.start + i)); + } + + BinnedFeatureVector::index_const_iterator indicesBegin = filteredFeatureVector.indices_cbegin(i); + BinnedFeatureVector::index_const_iterator indicesEnd = filteredFeatureVector.indices_cend(i); + uint32 numIndices = indicesEnd - indicesBegin; + EXPECT_EQ(numIndices, numExamplesPerBin); + + for (uint32 j = 0; j < numIndices; j++) { + EXPECT_EQ(indicesBegin[j], ((i + interval.start) * numExamplesPerBin) + j); + } + } + } +} + +TEST(BinnedFeatureVectorDecoratorTest, createFilteredFeatureVectorFromIndicesInverse) { + uint32 numBins = 4; + uint32 numExamplesPerBin = 10; + uint32 numMinorityExamples = numBins * numExamplesPerBin; + AllocatedBinnedFeatureVector featureVector(numBins, numMinorityExamples, 0); + AllocatedBinnedFeatureVector::threshold_iterator thresholdIterator = featureVector.thresholds; + AllocatedBinnedFeatureVector::index_iterator indptrIterator = featureVector.indptr; + AllocatedBinnedFeatureVector::index_iterator indexIterator = featureVector.indices_begin(0); + + for (uint32 i = 0; i < numBins; i++) { + if (i < numBins - 1) { + thresholdIterator[i] = i; + } + indptrIterator[i] = i * numExamplesPerBin; + + for (uint32 j = 0; j < numExamplesPerBin; j++) { + uint32 index = (i * numExamplesPerBin) + j; + indexIterator[index] = index; + } + } + + 
BinnedFeatureVectorDecorator decorator(std::move(featureVector), AllocatedMissingFeatureVector()); + std::unique_ptr existing; + Interval interval(2, numBins, true); + std::unique_ptr filtered = decorator.createFilteredFeatureVector(existing, interval); + const BinnedFeatureVectorView* filteredDecorator = dynamic_cast(filtered.get()); + EXPECT_TRUE(filteredDecorator != nullptr); + + if (filteredDecorator) { + // Check filtered indices... + const BinnedFeatureVector& filteredFeatureVector = filteredDecorator->getView().firstView; + EXPECT_EQ(filteredFeatureVector.numBins, interval.end - interval.start); + BinnedFeatureVector::threshold_const_iterator thresholdsBegin = filteredFeatureVector.thresholds_cbegin(); + + for (uint32 i = 0; i < filteredFeatureVector.numBins; i++) { + EXPECT_EQ(thresholdsBegin[i], (int32) i); + BinnedFeatureVector::index_const_iterator indicesBegin = filteredFeatureVector.indices_cbegin(i); + BinnedFeatureVector::index_const_iterator indicesEnd = filteredFeatureVector.indices_cend(i); + uint32 numIndices = indicesEnd - indicesBegin; + EXPECT_EQ(numIndices, numExamplesPerBin); + + for (uint32 j = 0; j < numIndices; j++) { + EXPECT_EQ(indicesBegin[j], (i * numExamplesPerBin) + j); + } + } + } +} + +TEST(BinnedFeatureVectorDecoratorTest, createFilteredFeatureVectorFromViewWithIndicesInverse) { + uint32 numBins = 4; + uint32 numExamplesPerBin = 10; + uint32 numMinorityExamples = numBins * numExamplesPerBin; + AllocatedBinnedFeatureVector featureVector(numBins, numMinorityExamples, 0); + AllocatedBinnedFeatureVector::threshold_iterator thresholdIterator = featureVector.thresholds; + AllocatedBinnedFeatureVector::index_iterator indptrIterator = featureVector.indptr; + AllocatedBinnedFeatureVector::index_iterator indexIterator = featureVector.indices_begin(0); + + for (uint32 i = 0; i < numBins; i++) { + if (i < numBins - 1) { + thresholdIterator[i] = i; + } + indptrIterator[i] = i * numExamplesPerBin; + + for (uint32 j = 0; j < numExamplesPerBin; 
j++) { + uint32 index = (i * numExamplesPerBin) + j; + indexIterator[index] = index; + } + } + + BinnedFeatureVectorDecorator decorator(std::move(featureVector), AllocatedMissingFeatureVector()); + std::unique_ptr existing; + Interval interval(2, numBins, true); + std::unique_ptr filtered = decorator.createFilteredFeatureVector(existing, Interval(0, numBins)) + ->createFilteredFeatureVector(existing, interval); + const BinnedFeatureVectorView* filteredDecorator = dynamic_cast(filtered.get()); + EXPECT_TRUE(filteredDecorator != nullptr); + + if (filteredDecorator) { + // Check filtered indices... + const BinnedFeatureVector& filteredFeatureVector = filteredDecorator->getView().firstView; + EXPECT_EQ(filteredFeatureVector.numBins, interval.end - interval.start); + BinnedFeatureVector::threshold_const_iterator thresholdsBegin = filteredFeatureVector.thresholds_cbegin(); + + for (uint32 i = 0; i < filteredFeatureVector.numBins; i++) { + EXPECT_EQ(thresholdsBegin[i], (int32) i); + BinnedFeatureVector::index_const_iterator indicesBegin = filteredFeatureVector.indices_cbegin(i); + BinnedFeatureVector::index_const_iterator indicesEnd = filteredFeatureVector.indices_cend(i); + uint32 numIndices = indicesEnd - indicesBegin; + EXPECT_EQ(numIndices, numExamplesPerBin); + + for (uint32 j = 0; j < numIndices; j++) { + EXPECT_EQ(indicesBegin[j], (i * numExamplesPerBin) + j); + } + } + } +} + +TEST(BinnedFeatureVectorDecoratorTest, createFilteredFeatureVectorFromViewWithCoverageMask) { + uint32 numBins = 4; + uint32 numExamplesPerBin = 10; + uint32 numMinorityExamples = numBins * numExamplesPerBin; + AllocatedBinnedFeatureVector featureVector(numBins, numMinorityExamples, 0); + AllocatedBinnedFeatureVector::threshold_iterator thresholdIterator = featureVector.thresholds; + AllocatedBinnedFeatureVector::index_iterator indptrIterator = featureVector.indptr; + AllocatedBinnedFeatureVector::index_iterator indexIterator = featureVector.indices_begin(0); + + for (uint32 i = 0; i < 
numBins; i++) { + if (i < numBins - 1) { + thresholdIterator[i] = i; + } + + indptrIterator[i] = i * numExamplesPerBin; + + for (uint32 j = 0; j < numExamplesPerBin; j++) { + uint32 index = (i * numExamplesPerBin) + j; + indexIterator[index] = index; + } + } + + CoverageMask coverageMask(numMinorityExamples); + uint32 indicatorValue = 1; + coverageMask.indicatorValue = indicatorValue; + CoverageMask::iterator coverageMaskIterator = coverageMask.begin(); + + for (uint32 i = 0; i < numMinorityExamples; i++) { + if (i % 2 == 0) { + coverageMaskIterator[i] = indicatorValue; + } + } + + BinnedFeatureVectorDecorator decorator(std::move(featureVector), AllocatedMissingFeatureVector()); + std::unique_ptr existing; + std::unique_ptr filtered = decorator.createFilteredFeatureVector(existing, Interval(0, numBins)) + ->createFilteredFeatureVector(existing, coverageMask); + const BinnedFeatureVectorDecorator* filteredDecorator = + dynamic_cast(filtered.get()); + EXPECT_TRUE(filteredDecorator != nullptr); + + if (filteredDecorator) { + // Check filtered indices... 
+ const BinnedFeatureVector& filteredFeatureVector = filteredDecorator->getView().firstView; + + for (uint32 i = 0; i < numBins; i++) { + BinnedFeatureVector::index_const_iterator indicesBegin = filteredFeatureVector.indices_cbegin(i); + BinnedFeatureVector::index_const_iterator indicesEnd = filteredFeatureVector.indices_cend(i); + uint32 numIndices = indicesEnd - indicesBegin; + EXPECT_EQ(numIndices, numExamplesPerBin / 2); + + std::unordered_set indices; + + for (auto it = indicesBegin; it != indicesEnd; it++) { + indices.emplace(*it); + } + + for (uint32 j = 0; j < numExamplesPerBin; j++) { + uint32 index = (i * numExamplesPerBin) + j; + + if (index % 2 == 0) { + EXPECT_TRUE(indices.find(index) != indices.end()); + } else { + EXPECT_TRUE(indices.find(index) == indices.end()); + } + } + } + } +} + +TEST(BinnedFeatureVectorDecoratorTest, createFilteredFeatureVectorFromCoverageMask) { + uint32 numBins = 3; + uint32 numExamplesPerBin = 10; + uint32 numMinorityExamples = numBins * numExamplesPerBin; + AllocatedBinnedFeatureVector featureVector(numBins, numMinorityExamples, 0); + AllocatedBinnedFeatureVector::threshold_iterator thresholdIterator = featureVector.thresholds; + AllocatedBinnedFeatureVector::index_iterator indptrIterator = featureVector.indptr; + AllocatedBinnedFeatureVector::index_iterator indexIterator = featureVector.indices_begin(0); + + for (uint32 i = 0; i < numBins; i++) { + if (i < numBins - 1) { + thresholdIterator[i] = i; + } + + indptrIterator[i] = i * numExamplesPerBin; + + for (uint32 j = 0; j < numExamplesPerBin; j++) { + uint32 index = (i * numExamplesPerBin) + j; + indexIterator[index] = index; + } + } + + AllocatedMissingFeatureVector missingFeatureVector; + uint32 numMissingIndices = 10; + uint32 numExamples = numMinorityExamples + numMissingIndices; + + for (uint32 i = numMinorityExamples; i < numExamples; i++) { + missingFeatureVector.set(i, true); + } + + CoverageMask coverageMask(numExamples); + uint32 indicatorValue = 1; + 
coverageMask.indicatorValue = indicatorValue; + CoverageMask::iterator coverageMaskIterator = coverageMask.begin(); + + for (uint32 i = 0; i < numExamples; i++) { + if (i % 2 == 0) { + coverageMaskIterator[i] = indicatorValue; + } + } + + BinnedFeatureVectorDecorator decorator(std::move(featureVector), std::move(missingFeatureVector)); + std::unique_ptr existing; + std::unique_ptr filtered = decorator.createFilteredFeatureVector(existing, coverageMask); + const BinnedFeatureVectorDecorator* filteredDecorator = + dynamic_cast(filtered.get()); + EXPECT_TRUE(filteredDecorator != nullptr); + + if (filteredDecorator) { + // Check filtered indices... + const BinnedFeatureVector& filteredFeatureVector = filteredDecorator->getView().firstView; + + for (uint32 i = 0; i < numBins; i++) { + BinnedFeatureVector::index_const_iterator indicesBegin = filteredFeatureVector.indices_cbegin(i); + BinnedFeatureVector::index_const_iterator indicesEnd = filteredFeatureVector.indices_cend(i); + uint32 numIndices = indicesEnd - indicesBegin; + EXPECT_EQ(numIndices, numExamplesPerBin / 2); + + std::unordered_set indices; + + for (auto it = indicesBegin; it != indicesEnd; it++) { + indices.emplace(*it); + } + + for (uint32 j = 0; j < numExamplesPerBin; j++) { + uint32 index = (i * numExamplesPerBin) + j; + + if (index % 2 == 0) { + EXPECT_TRUE(indices.find(index) != indices.end()); + } else { + EXPECT_TRUE(indices.find(index) == indices.end()); + } + } + } + + // Check missing indices... 
+ const MissingFeatureVector& filteredMissingFeatureVector = filteredDecorator->getView().secondView; + + for (uint32 i = numMinorityExamples; i < numExamples; i++) { + if (i % 2 == 0) { + EXPECT_TRUE(filteredMissingFeatureVector[i]); + } else { + EXPECT_FALSE(filteredMissingFeatureVector[i]); + } + } + } +} + +TEST(BinnedFeatureVectorDecoratorTest, createFilteredFeatureVectorFromCoverageMaskUsingExisting) { + uint32 numBins = 3; + uint32 numExamplesPerBin = 10; + uint32 numMinorityExamples = numBins * numExamplesPerBin; + AllocatedBinnedFeatureVector featureVector(numBins, numMinorityExamples, 0); + AllocatedBinnedFeatureVector::threshold_iterator thresholdIterator = featureVector.thresholds; + AllocatedBinnedFeatureVector::index_iterator indptrIterator = featureVector.indptr; + AllocatedBinnedFeatureVector::index_iterator indexIterator = featureVector.indices_begin(0); + + for (uint32 i = 0; i < numBins; i++) { + if (i < numBins - 1) { + thresholdIterator[i] = i; + } + + indptrIterator[i] = i * numExamplesPerBin; + + for (uint32 j = 0; j < numExamplesPerBin; j++) { + uint32 index = (i * numExamplesPerBin) + j; + indexIterator[index] = index; + } + } + + AllocatedMissingFeatureVector missingFeatureVector; + uint32 numMissingIndices = 10; + uint32 numExamples = numMinorityExamples + numMissingIndices; + + for (uint32 i = numMinorityExamples; i < numExamples; i++) { + missingFeatureVector.set(i, true); + } + + CoverageMask coverageMask(numExamples); + uint32 indicatorValue = 1; + coverageMask.indicatorValue = indicatorValue; + CoverageMask::iterator coverageMaskIterator = coverageMask.begin(); + + for (uint32 i = 0; i < numExamples; i++) { + if (i % 2 == 0) { + coverageMaskIterator[i] = indicatorValue; + } + } + + std::unique_ptr existing = + std::make_unique(std::move(featureVector), std::move(missingFeatureVector)); + std::unique_ptr filtered = existing->createFilteredFeatureVector(existing, coverageMask); + const BinnedFeatureVectorDecorator* filteredDecorator = 
+ dynamic_cast(filtered.get()); + EXPECT_TRUE(filteredDecorator != nullptr); + EXPECT_TRUE(existing.get() == nullptr); + + if (filteredDecorator) { + // Check filtered indices... + const BinnedFeatureVector& filteredFeatureVector = filteredDecorator->getView().firstView; + + for (uint32 i = 0; i < numBins; i++) { + BinnedFeatureVector::index_const_iterator indicesBegin = filteredFeatureVector.indices_cbegin(i); + BinnedFeatureVector::index_const_iterator indicesEnd = filteredFeatureVector.indices_cend(i); + uint32 numIndices = indicesEnd - indicesBegin; + EXPECT_EQ(numIndices, numExamplesPerBin / 2); + + std::unordered_set indices; + + for (auto it = indicesBegin; it != indicesEnd; it++) { + indices.emplace(*it); + } + + for (uint32 j = 0; j < numExamplesPerBin; j++) { + uint32 index = (i * numExamplesPerBin) + j; + + if (index % 2 == 0) { + EXPECT_TRUE(indices.find(index) != indices.end()); + } else { + EXPECT_TRUE(indices.find(index) == indices.end()); + } + } + } + + // Check missing indices... 
+ const MissingFeatureVector& filteredMissingFeatureVector = filteredDecorator->getView().secondView; + + for (uint32 i = numMinorityExamples; i < numExamples; i++) { + if (i % 2 == 0) { + EXPECT_TRUE(filteredMissingFeatureVector[i]); + } else { + EXPECT_FALSE(filteredMissingFeatureVector[i]); + } + } + } +} + +TEST(BinnedFeatureVectorDecoratorTest, createFilteredFeatureVectorFromCoverageMaskReturnsEqualFeatureVector) { + uint32 numBins = 1; + uint32 numExamplesPerBin = 10; + uint32 numMinorityExamples = numBins * numExamplesPerBin; + AllocatedBinnedFeatureVector featureVector(1, numMinorityExamples, 0); + AllocatedBinnedFeatureVector::index_iterator indexIterator = featureVector.indices_begin(0); + + for (uint32 i = 0; i < numBins; i++) { + for (uint32 j = 0; j < numExamplesPerBin; j++) { + uint32 index = (i * numExamplesPerBin) + j; + indexIterator[index] = index; + } + } + + CoverageMask coverageMask(numMinorityExamples); + coverageMask.indicatorValue = 1; + + BinnedFeatureVectorDecorator decorator(std::move(featureVector), AllocatedMissingFeatureVector()); + std::unique_ptr existing; + std::unique_ptr filtered = decorator.createFilteredFeatureVector(existing, coverageMask); + const EqualFeatureVector* filteredFeatureVector = dynamic_cast(filtered.get()); + EXPECT_TRUE(filteredFeatureVector != nullptr); +} diff --git a/cpp/subprojects/common/test/mlrl/common/input/feature_vector_decorator_nominal.cpp b/cpp/subprojects/common/test/mlrl/common/input/feature_vector_decorator_nominal.cpp index 41f52a2e5d..512f40638f 100644 --- a/cpp/subprojects/common/test/mlrl/common/input/feature_vector_decorator_nominal.cpp +++ b/cpp/subprojects/common/test/mlrl/common/input/feature_vector_decorator_nominal.cpp @@ -35,13 +35,13 @@ TEST(NominalFeatureVectorDecoratorTest, updateCoverageMaskAndStatistics) { CoverageMask coverageMask(numExamples); uint32 indicatorValue = 1; decorator.updateCoverageMaskAndStatistics(interval, coverageMask, indicatorValue, statistics); - 
EXPECT_EQ(coverageMask.getIndicatorValue(), indicatorValue); + EXPECT_EQ(coverageMask.indicatorValue, indicatorValue); const NominalFeatureVector& nominalFeatureVector = decorator.getView().firstView; for (uint32 i = 0; i < interval.start; i++) { for (auto it = nominalFeatureVector.indices_cbegin(i); it != nominalFeatureVector.indices_cend(i); it++) { uint32 index = *it; - EXPECT_FALSE(coverageMask.isCovered(index)); + EXPECT_FALSE(coverageMask[index]); EXPECT_FALSE(statistics.coveredStatistics.find(index) != statistics.coveredStatistics.end()); } } @@ -49,7 +49,7 @@ TEST(NominalFeatureVectorDecoratorTest, updateCoverageMaskAndStatistics) { for (uint32 i = interval.start; i < interval.end; i++) { for (auto it = nominalFeatureVector.indices_cbegin(i); it != nominalFeatureVector.indices_cend(i); it++) { uint32 index = *it; - EXPECT_TRUE(coverageMask.isCovered(index)); + EXPECT_TRUE(coverageMask[index]); EXPECT_TRUE(statistics.coveredStatistics.find(index) != statistics.coveredStatistics.end()); } } @@ -57,13 +57,13 @@ TEST(NominalFeatureVectorDecoratorTest, updateCoverageMaskAndStatistics) { for (uint32 i = interval.end; i < numValues; i++) { for (auto it = nominalFeatureVector.indices_cbegin(i); it != nominalFeatureVector.indices_cend(i); it++) { uint32 index = *it; - EXPECT_FALSE(coverageMask.isCovered(index)); + EXPECT_FALSE(coverageMask[index]); EXPECT_FALSE(statistics.coveredStatistics.find(index) != statistics.coveredStatistics.end()); } } for (uint32 i = numMinorityExamples; i < numExamples; i++) { - EXPECT_FALSE(coverageMask.isCovered(i)); + EXPECT_FALSE(coverageMask[i]); EXPECT_FALSE(statistics.coveredStatistics.find(i) != statistics.coveredStatistics.end()); } } @@ -106,13 +106,13 @@ TEST(NominalFeatureVectorDecoratorTest, updateCoverageMaskAndStatisticsInverse) CoverageMask coverageMask(numExamples); uint32 indicatorValue = 1; decorator.updateCoverageMaskAndStatistics(interval, coverageMask, indicatorValue, statistics); - 
EXPECT_EQ(coverageMask.getIndicatorValue(), (uint32) 0); + EXPECT_EQ(coverageMask.indicatorValue, (uint32) 0); const NominalFeatureVector& nominalFeatureVector = decorator.getView().firstView; for (uint32 i = 0; i < interval.start; i++) { for (auto it = nominalFeatureVector.indices_cbegin(i); it != nominalFeatureVector.indices_cend(i); it++) { uint32 index = *it; - EXPECT_TRUE(coverageMask.isCovered(index)); + EXPECT_TRUE(coverageMask[index]); EXPECT_TRUE(statistics.coveredStatistics.find(index) != statistics.coveredStatistics.end()); } } @@ -120,7 +120,7 @@ TEST(NominalFeatureVectorDecoratorTest, updateCoverageMaskAndStatisticsInverse) for (uint32 i = interval.start; i < interval.end; i++) { for (auto it = nominalFeatureVector.indices_cbegin(i); it != nominalFeatureVector.indices_cend(i); it++) { uint32 index = *it; - EXPECT_FALSE(coverageMask.isCovered(index)); + EXPECT_FALSE(coverageMask[index]); EXPECT_FALSE(statistics.coveredStatistics.find(index) != statistics.coveredStatistics.end()); } } @@ -128,18 +128,18 @@ TEST(NominalFeatureVectorDecoratorTest, updateCoverageMaskAndStatisticsInverse) for (uint32 i = interval.end; i < numValues; i++) { for (auto it = nominalFeatureVector.indices_cbegin(i); it != nominalFeatureVector.indices_cend(i); it++) { uint32 index = *it; - EXPECT_TRUE(coverageMask.isCovered(index)); + EXPECT_TRUE(coverageMask[index]); EXPECT_TRUE(statistics.coveredStatistics.find(index) != statistics.coveredStatistics.end()); } } for (uint32 i = numMinorityExamples; i < numMinorityExamples + numMissingExamples; i++) { - EXPECT_FALSE(coverageMask.isCovered(i)); + EXPECT_FALSE(coverageMask[i]); EXPECT_FALSE(statistics.coveredStatistics.find(i) != statistics.coveredStatistics.end()); } for (uint32 i = numMinorityExamples + numMissingExamples; i < numExamples; i++) { - EXPECT_TRUE(coverageMask.isCovered(i)); + EXPECT_TRUE(coverageMask[i]); EXPECT_TRUE(statistics.coveredStatistics.find(i) != statistics.coveredStatistics.end()); } } @@ -327,7 +327,7 @@ 
TEST(NominalFeatureVectorDecoratorTest, createFilteredFeatureVectorFromCoverageM CoverageMask coverageMask(numExamples); uint32 indicatorValue = 1; - coverageMask.setIndicatorValue(indicatorValue); + coverageMask.indicatorValue = indicatorValue; CoverageMask::iterator coverageMaskIterator = coverageMask.begin(); for (uint32 i = 0; i < numExamples; i++) { @@ -412,7 +412,7 @@ TEST(NominalFeatureVectorDecoratorTest, createFilteredFeatureVectorFromCoverageM CoverageMask coverageMask(numExamples); uint32 indicatorValue = 1; - coverageMask.setIndicatorValue(indicatorValue); + coverageMask.indicatorValue = indicatorValue; CoverageMask::iterator coverageMaskIterator = coverageMask.begin(); for (uint32 i = 0; i < numExamples; i++) { @@ -484,7 +484,7 @@ TEST(NominalFeatureVectorDecoratorTest, createFilteredFeatureVectorFromCoverageM } CoverageMask coverageMask(numMinorityExamples); - coverageMask.setIndicatorValue(1); + coverageMask.indicatorValue = 1; NominalFeatureVectorDecorator decorator(std::move(featureVector), AllocatedMissingFeatureVector()); std::unique_ptr existing; diff --git a/cpp/subprojects/common/test/mlrl/common/input/feature_vector_decorator_numerical.cpp b/cpp/subprojects/common/test/mlrl/common/input/feature_vector_decorator_numerical.cpp index bfaf9a6d0a..2fd5cc3f64 100644 --- a/cpp/subprojects/common/test/mlrl/common/input/feature_vector_decorator_numerical.cpp +++ b/cpp/subprojects/common/test/mlrl/common/input/feature_vector_decorator_numerical.cpp @@ -26,20 +26,20 @@ TEST(NumericalFeatureVectorDecoratorTest, updateCoverageMaskAndStatistics) { CoverageMask coverageMask(numDenseExamples); uint32 indicatorValue = 1; decorator.updateCoverageMaskAndStatistics(interval, coverageMask, indicatorValue, statistics); - EXPECT_EQ(coverageMask.getIndicatorValue(), indicatorValue); + EXPECT_EQ(coverageMask.indicatorValue, indicatorValue); for (uint32 i = 0; i < interval.start; i++) { - EXPECT_FALSE(coverageMask.isCovered(i)); + EXPECT_FALSE(coverageMask[i]); 
EXPECT_FALSE(statistics.coveredStatistics.find(i) != statistics.coveredStatistics.end()); } for (uint32 i = interval.start; i < interval.end; i++) { - EXPECT_TRUE(coverageMask.isCovered(i)); + EXPECT_TRUE(coverageMask[i]); EXPECT_TRUE(statistics.coveredStatistics.find(i) != statistics.coveredStatistics.end()); } for (uint32 i = interval.end; i < numDenseExamples; i++) { - EXPECT_FALSE(coverageMask.isCovered(i)); + EXPECT_FALSE(coverageMask[i]); EXPECT_FALSE(statistics.coveredStatistics.find(i) != statistics.coveredStatistics.end()); } } @@ -74,20 +74,20 @@ TEST(NumericalFeatureVectorDecoratorTest, updateCoverageMaskAndStatisticsInverse CoverageMask coverageMask(numExamples); uint32 indicatorValue = 1; decorator.updateCoverageMaskAndStatistics(interval, coverageMask, indicatorValue, statistics); - EXPECT_EQ(coverageMask.getIndicatorValue(), (uint32) 0); + EXPECT_EQ(coverageMask.indicatorValue, (uint32) 0); for (uint32 i = 0; i < interval.start; i++) { - EXPECT_TRUE(coverageMask.isCovered(i)); + EXPECT_TRUE(coverageMask[i]); EXPECT_TRUE(statistics.coveredStatistics.find(i) != statistics.coveredStatistics.end()); } for (uint32 i = interval.start; i < interval.end; i++) { - EXPECT_FALSE(coverageMask.isCovered(i)); + EXPECT_FALSE(coverageMask[i]); EXPECT_FALSE(statistics.coveredStatistics.find(i) != statistics.coveredStatistics.end()); } for (uint32 i = interval.end; i < numDenseExamples; i++) { - EXPECT_TRUE(coverageMask.isCovered(i)); + EXPECT_TRUE(coverageMask[i]); EXPECT_TRUE(statistics.coveredStatistics.find(i) != statistics.coveredStatistics.end()); } @@ -121,20 +121,20 @@ TEST(NumericalFeatureVectorDecoratorTest, updateCoverageMaskAndStatisticsFromVie CoverageMask coverageMask(numDenseExamples); uint32 indicatorValue = 1; filtered->updateCoverageMaskAndStatistics(interval, coverageMask, indicatorValue, statistics); - EXPECT_EQ(coverageMask.getIndicatorValue(), indicatorValue); + EXPECT_EQ(coverageMask.indicatorValue, indicatorValue); for (uint32 i = 0; i < 
interval.start; i++) { - EXPECT_FALSE(coverageMask.isCovered(i)); + EXPECT_FALSE(coverageMask[i]); EXPECT_FALSE(statistics.coveredStatistics.find(i) != statistics.coveredStatistics.end()); } for (uint32 i = interval.start; i < interval.end; i++) { - EXPECT_TRUE(coverageMask.isCovered(i)); + EXPECT_TRUE(coverageMask[i]); EXPECT_TRUE(statistics.coveredStatistics.find(i) != statistics.coveredStatistics.end()); } for (uint32 i = interval.end; i < numDenseExamples; i++) { - EXPECT_FALSE(coverageMask.isCovered(i)); + EXPECT_FALSE(coverageMask[i]); EXPECT_FALSE(statistics.coveredStatistics.find(i) != statistics.coveredStatistics.end()); } } @@ -164,20 +164,20 @@ TEST(NumericalFeatureVectorDecoratorTest, updateCoverageMaskAndStatisticsFromVie CoverageMask coverageMask(numDenseExamples); uint32 indicatorValue = 1; filtered->updateCoverageMaskAndStatistics(interval, coverageMask, indicatorValue, statistics); - EXPECT_EQ(coverageMask.getIndicatorValue(), (uint32) 0); + EXPECT_EQ(coverageMask.indicatorValue, (uint32) 0); for (uint32 i = 0; i < interval.start; i++) { - EXPECT_TRUE(coverageMask.isCovered(i)); + EXPECT_TRUE(coverageMask[i]); EXPECT_TRUE(statistics.coveredStatistics.find(i) != statistics.coveredStatistics.end()); } for (uint32 i = interval.start; i < interval.end; i++) { - EXPECT_FALSE(coverageMask.isCovered(i)); + EXPECT_FALSE(coverageMask[i]); EXPECT_FALSE(statistics.coveredStatistics.find(i) != statistics.coveredStatistics.end()); } for (uint32 i = interval.end; i < numDenseExamples; i++) { - EXPECT_TRUE(coverageMask.isCovered(i)); + EXPECT_TRUE(coverageMask[i]); EXPECT_TRUE(statistics.coveredStatistics.find(i) != statistics.coveredStatistics.end()); } } @@ -368,7 +368,7 @@ TEST(NumericalFeatureVectorDecoratorTest, createFilteredFeatureVectorFromViewWit CoverageMask coverageMask(numDenseExamples); uint32 indicatorValue = 1; - coverageMask.setIndicatorValue(indicatorValue); + coverageMask.indicatorValue = indicatorValue; CoverageMask::iterator 
coverageMaskIterator = coverageMask.begin(); for (uint32 i = 0; i < numDenseExamples; i++) { @@ -430,7 +430,7 @@ TEST(NumericalFeatureVectorDecoratorTest, createFilteredFeatureVectorFromCoverag CoverageMask coverageMask(numExamples); uint32 indicatorValue = 1; - coverageMask.setIndicatorValue(indicatorValue); + coverageMask.indicatorValue = indicatorValue; CoverageMask::iterator coverageMaskIterator = coverageMask.begin(); for (uint32 i = 0; i < numExamples; i++) { @@ -498,7 +498,7 @@ TEST(NumericalFeatureVectorDecoratorTest, createFilteredFeatureVectorFromCoverag CoverageMask coverageMask(numExamples); uint32 indicatorValue = 1; - coverageMask.setIndicatorValue(indicatorValue); + coverageMask.indicatorValue = indicatorValue; CoverageMask::iterator coverageMaskIterator = coverageMask.begin(); for (uint32 i = 0; i < numExamples; i++) { @@ -559,7 +559,7 @@ TEST(NumericalFeatureVectorDecoratorTest, createFilteredFeatureVectorFromCoverag } CoverageMask coverageMask(numDenseExamples); - coverageMask.setIndicatorValue(1); + coverageMask.indicatorValue = 1; NumericalFeatureVectorDecorator decorator(std::move(featureVector), AllocatedMissingFeatureVector()); std::unique_ptr existing; diff --git a/cpp/subprojects/common/test/mlrl/common/input/feature_vector_decorator_ordinal.cpp b/cpp/subprojects/common/test/mlrl/common/input/feature_vector_decorator_ordinal.cpp index 8d30bb0463..8312a43db1 100644 --- a/cpp/subprojects/common/test/mlrl/common/input/feature_vector_decorator_ordinal.cpp +++ b/cpp/subprojects/common/test/mlrl/common/input/feature_vector_decorator_ordinal.cpp @@ -36,13 +36,13 @@ TEST(OrdinalFeatureVectorDecoratorTest, updateCoverageMaskAndStatistics) { CoverageMask coverageMask(numExamples); uint32 indicatorValue = 1; decorator.updateCoverageMaskAndStatistics(interval, coverageMask, indicatorValue, statistics); - EXPECT_EQ(coverageMask.getIndicatorValue(), indicatorValue); + EXPECT_EQ(coverageMask.indicatorValue, indicatorValue); const OrdinalFeatureVector& 
ordinalFeatureVector = decorator.getView().firstView; for (uint32 i = 0; i < interval.start; i++) { for (auto it = ordinalFeatureVector.indices_cbegin(i); it != ordinalFeatureVector.indices_cend(i); it++) { uint32 index = *it; - EXPECT_FALSE(coverageMask.isCovered(index)); + EXPECT_FALSE(coverageMask[index]); EXPECT_FALSE(statistics.coveredStatistics.find(index) != statistics.coveredStatistics.end()); } } @@ -50,7 +50,7 @@ TEST(OrdinalFeatureVectorDecoratorTest, updateCoverageMaskAndStatistics) { for (uint32 i = interval.start; i < interval.end; i++) { for (auto it = ordinalFeatureVector.indices_cbegin(i); it != ordinalFeatureVector.indices_cend(i); it++) { uint32 index = *it; - EXPECT_TRUE(coverageMask.isCovered(index)); + EXPECT_TRUE(coverageMask[index]); EXPECT_TRUE(statistics.coveredStatistics.find(index) != statistics.coveredStatistics.end()); } } @@ -58,13 +58,13 @@ TEST(OrdinalFeatureVectorDecoratorTest, updateCoverageMaskAndStatistics) { for (uint32 i = interval.end; i < numValues; i++) { for (auto it = ordinalFeatureVector.indices_cbegin(i); it != ordinalFeatureVector.indices_cend(i); it++) { uint32 index = *it; - EXPECT_FALSE(coverageMask.isCovered(index)); + EXPECT_FALSE(coverageMask[index]); EXPECT_FALSE(statistics.coveredStatistics.find(index) != statistics.coveredStatistics.end()); } } for (uint32 i = numMinorityExamples; i < numExamples; i++) { - EXPECT_FALSE(coverageMask.isCovered(i)); + EXPECT_FALSE(coverageMask[i]); EXPECT_FALSE(statistics.coveredStatistics.find(i) != statistics.coveredStatistics.end()); } } @@ -107,13 +107,13 @@ TEST(OrdinalFeatureVectorDecoratorTest, updateCoverageMaskAndStatisticsInverse) CoverageMask coverageMask(numExamples); uint32 indicatorValue = 1; decorator.updateCoverageMaskAndStatistics(interval, coverageMask, indicatorValue, statistics); - EXPECT_EQ(coverageMask.getIndicatorValue(), (uint32) 0); + EXPECT_EQ(coverageMask.indicatorValue, (uint32) 0); const OrdinalFeatureVector& ordinalFeatureVector = 
decorator.getView().firstView; for (uint32 i = 0; i < interval.start; i++) { for (auto it = ordinalFeatureVector.indices_cbegin(i); it != ordinalFeatureVector.indices_cend(i); it++) { uint32 index = *it; - EXPECT_TRUE(coverageMask.isCovered(index)); + EXPECT_TRUE(coverageMask[index]); EXPECT_TRUE(statistics.coveredStatistics.find(index) != statistics.coveredStatistics.end()); } } @@ -121,7 +121,7 @@ TEST(OrdinalFeatureVectorDecoratorTest, updateCoverageMaskAndStatisticsInverse) for (uint32 i = interval.start; i < interval.end; i++) { for (auto it = ordinalFeatureVector.indices_cbegin(i); it != ordinalFeatureVector.indices_cend(i); it++) { uint32 index = *it; - EXPECT_FALSE(coverageMask.isCovered(index)); + EXPECT_FALSE(coverageMask[index]); EXPECT_FALSE(statistics.coveredStatistics.find(index) != statistics.coveredStatistics.end()); } } @@ -129,18 +129,18 @@ TEST(OrdinalFeatureVectorDecoratorTest, updateCoverageMaskAndStatisticsInverse) for (uint32 i = interval.end; i < numValues; i++) { for (auto it = ordinalFeatureVector.indices_cbegin(i); it != ordinalFeatureVector.indices_cend(i); it++) { uint32 index = *it; - EXPECT_TRUE(coverageMask.isCovered(index)); + EXPECT_TRUE(coverageMask[index]); EXPECT_TRUE(statistics.coveredStatistics.find(index) != statistics.coveredStatistics.end()); } } for (uint32 i = numMinorityExamples; i < numMinorityExamples + numMissingExamples; i++) { - EXPECT_FALSE(coverageMask.isCovered(i)); + EXPECT_FALSE(coverageMask[i]); EXPECT_FALSE(statistics.coveredStatistics.find(i) != statistics.coveredStatistics.end()); } for (uint32 i = numMinorityExamples + numMissingExamples; i < numExamples; i++) { - EXPECT_TRUE(coverageMask.isCovered(i)); + EXPECT_TRUE(coverageMask[i]); EXPECT_TRUE(statistics.coveredStatistics.find(i) != statistics.coveredStatistics.end()); } } @@ -178,13 +178,13 @@ TEST(OrdinalFeatureVectorDecoratorTest, updateCoverageMaskAndStatisticsFromView) std::unique_ptr existing; decorator.createFilteredFeatureVector(existing, 
Interval(0, numValues)) ->updateCoverageMaskAndStatistics(interval, coverageMask, indicatorValue, statistics); - EXPECT_EQ(coverageMask.getIndicatorValue(), indicatorValue); + EXPECT_EQ(coverageMask.indicatorValue, indicatorValue); const OrdinalFeatureVector& ordinalFeatureVector = decorator.getView().firstView; for (uint32 i = 0; i < interval.start; i++) { for (auto it = ordinalFeatureVector.indices_cbegin(i); it != ordinalFeatureVector.indices_cend(i); it++) { uint32 index = *it; - EXPECT_FALSE(coverageMask.isCovered(index)); + EXPECT_FALSE(coverageMask[index]); EXPECT_FALSE(statistics.coveredStatistics.find(index) != statistics.coveredStatistics.end()); } } @@ -192,7 +192,7 @@ TEST(OrdinalFeatureVectorDecoratorTest, updateCoverageMaskAndStatisticsFromView) for (uint32 i = interval.start; i < interval.end; i++) { for (auto it = ordinalFeatureVector.indices_cbegin(i); it != ordinalFeatureVector.indices_cend(i); it++) { uint32 index = *it; - EXPECT_TRUE(coverageMask.isCovered(index)); + EXPECT_TRUE(coverageMask[index]); EXPECT_TRUE(statistics.coveredStatistics.find(index) != statistics.coveredStatistics.end()); } } @@ -200,13 +200,13 @@ TEST(OrdinalFeatureVectorDecoratorTest, updateCoverageMaskAndStatisticsFromView) for (uint32 i = interval.end; i < numValues; i++) { for (auto it = ordinalFeatureVector.indices_cbegin(i); it != ordinalFeatureVector.indices_cend(i); it++) { uint32 index = *it; - EXPECT_FALSE(coverageMask.isCovered(index)); + EXPECT_FALSE(coverageMask[index]); EXPECT_FALSE(statistics.coveredStatistics.find(index) != statistics.coveredStatistics.end()); } } for (uint32 i = numMinorityExamples; i < numExamples; i++) { - EXPECT_FALSE(coverageMask.isCovered(i)); + EXPECT_FALSE(coverageMask[i]); EXPECT_FALSE(statistics.coveredStatistics.find(i) != statistics.coveredStatistics.end()); } } @@ -244,13 +244,13 @@ TEST(OrdinalFeatureVectorDecoratorTest, updateCoverageMaskAndStatisticsFromViewI std::unique_ptr existing; 
decorator.createFilteredFeatureVector(existing, Interval(0, numValues)) ->updateCoverageMaskAndStatistics(interval, coverageMask, indicatorValue, statistics); - EXPECT_EQ(coverageMask.getIndicatorValue(), (uint32) 0); + EXPECT_EQ(coverageMask.indicatorValue, (uint32) 0); const OrdinalFeatureVector& ordinalFeatureVector = decorator.getView().firstView; for (uint32 i = 0; i < interval.start; i++) { for (auto it = ordinalFeatureVector.indices_cbegin(i); it != ordinalFeatureVector.indices_cend(i); it++) { uint32 index = *it; - EXPECT_TRUE(coverageMask.isCovered(index)); + EXPECT_TRUE(coverageMask[index]); EXPECT_TRUE(statistics.coveredStatistics.find(index) != statistics.coveredStatistics.end()); } } @@ -258,7 +258,7 @@ TEST(OrdinalFeatureVectorDecoratorTest, updateCoverageMaskAndStatisticsFromViewI for (uint32 i = interval.start; i < interval.end; i++) { for (auto it = ordinalFeatureVector.indices_cbegin(i); it != ordinalFeatureVector.indices_cend(i); it++) { uint32 index = *it; - EXPECT_FALSE(coverageMask.isCovered(index)); + EXPECT_FALSE(coverageMask[index]); EXPECT_FALSE(statistics.coveredStatistics.find(index) != statistics.coveredStatistics.end()); } } @@ -266,13 +266,13 @@ TEST(OrdinalFeatureVectorDecoratorTest, updateCoverageMaskAndStatisticsFromViewI for (uint32 i = interval.end; i < numValues; i++) { for (auto it = ordinalFeatureVector.indices_cbegin(i); it != ordinalFeatureVector.indices_cend(i); it++) { uint32 index = *it; - EXPECT_TRUE(coverageMask.isCovered(index)); + EXPECT_TRUE(coverageMask[index]); EXPECT_TRUE(statistics.coveredStatistics.find(index) != statistics.coveredStatistics.end()); } } for (uint32 i = numMinorityExamples; i < numExamples; i++) { - EXPECT_TRUE(coverageMask.isCovered(i)); + EXPECT_TRUE(coverageMask[i]); EXPECT_TRUE(statistics.coveredStatistics.find(i) != statistics.coveredStatistics.end()); } } @@ -484,7 +484,7 @@ TEST(OrdinalFeatureVectorDecoratorTest, createFilteredFeatureVectorFromViewWithC CoverageMask 
coverageMask(numMinorityExamples); uint32 indicatorValue = 1; - coverageMask.setIndicatorValue(indicatorValue); + coverageMask.indicatorValue = indicatorValue; CoverageMask::iterator coverageMaskIterator = coverageMask.begin(); for (uint32 i = 0; i < numMinorityExamples; i++) { @@ -559,7 +559,7 @@ TEST(OrdinalFeatureVectorDecoratorTest, createFilteredFeatureVectorFromCoverageM CoverageMask coverageMask(numExamples); uint32 indicatorValue = 1; - coverageMask.setIndicatorValue(indicatorValue); + coverageMask.indicatorValue = indicatorValue; CoverageMask::iterator coverageMaskIterator = coverageMask.begin(); for (uint32 i = 0; i < numExamples; i++) { @@ -644,7 +644,7 @@ TEST(OrdinalFeatureVectorDecoratorTest, createFilteredFeatureVectorFromCoverageM CoverageMask coverageMask(numExamples); uint32 indicatorValue = 1; - coverageMask.setIndicatorValue(indicatorValue); + coverageMask.indicatorValue = indicatorValue; CoverageMask::iterator coverageMaskIterator = coverageMask.begin(); for (uint32 i = 0; i < numExamples; i++) { @@ -716,7 +716,7 @@ TEST(OrdinalFeatureVectorDecoratorTest, createFilteredFeatureVectorFromCoverageM } CoverageMask coverageMask(numMinorityExamples); - coverageMask.setIndicatorValue(1); + coverageMask.indicatorValue = 1; OrdinalFeatureVectorDecorator decorator(std::move(featureVector), AllocatedMissingFeatureVector()); std::unique_ptr existing; diff --git a/cpp/subprojects/common/test/mlrl/common/input/statistics_weighted.hpp b/cpp/subprojects/common/test/mlrl/common/input/statistics_weighted.hpp index 593efe3aaa..8483e7400d 100644 --- a/cpp/subprojects/common/test/mlrl/common/input/statistics_weighted.hpp +++ b/cpp/subprojects/common/test/mlrl/common/input/statistics_weighted.hpp @@ -46,14 +46,4 @@ class WeightedStatistics final : public IWeightedStatistics { void removeCoveredStatistic(uint32 statisticIndex) override { coveredStatistics.erase(statisticIndex); } - - std::unique_ptr createHistogram(const DenseBinIndexVector& binIndexVector, - uint32 
numBins) const override { - throw std::runtime_error("not implemented"); - } - - std::unique_ptr createHistogram(const DokBinIndexVector& binIndexVector, - uint32 numBins) const override { - throw std::runtime_error("not implemented"); - } }; diff --git a/cpp/subprojects/seco/src/mlrl/seco/statistics/statistics_label_wise_common.hpp b/cpp/subprojects/seco/src/mlrl/seco/statistics/statistics_label_wise_common.hpp index ab800b7e77..d28ad43c28 100644 --- a/cpp/subprojects/seco/src/mlrl/seco/statistics/statistics_label_wise_common.hpp +++ b/cpp/subprojects/seco/src/mlrl/seco/statistics/statistics_label_wise_common.hpp @@ -528,24 +528,6 @@ namespace seco { const PartialIndexVector& labelIndices) const override { return std::make_unique>(*this, labelIndices); } - - /** - * @see `IWeightedStatistics::createHistogram` - */ - std::unique_ptr createHistogram(const DenseBinIndexVector& binIndexVector, - uint32 numBins) const override { - // TODO Support creation of histograms - return nullptr; - } - - /** - * @see `IWeightedStatistics::createHistogram` - */ - std::unique_ptr createHistogram(const DokBinIndexVector& binIndexVector, - uint32 numBins) const override { - // TODO Support creation of histograms - return nullptr; - } }; template diff --git a/doc/user_guide/boosting/parameters.md b/doc/user_guide/boosting/parameters.md index 0409745e7d..66d4804d39 100644 --- a/doc/user_guide/boosting/parameters.md +++ b/doc/user_guide/boosting/parameters.md @@ -241,13 +241,13 @@ The following parameters may be used to control the behavior of the algorithm. T - `'equal-width'` Examples are assigned to bins, based on their feature values, according to the equal-width binning method. The following options may be provided using the {ref}`bracket-notation`: - - `bin_ratio` (Default value = `0.33`) A percentage that specifies how many bins should be used. For example, a value of 0.3 means that the number of bins should be set to 30% of the number of distinct values for a feature. 
+ - `bin_ratio` (Default value = `0.33`) A percentage that specifies how many bins should be used. For example, a value of 0.3 means that the number of bins should be set to 30% of the total number of available training examples. - `min_bins` (Default value = `2`) The minimum number of bins. Must be at least 2. - `max_bins` (Default value = `0`) The maximum number of bins. Must be at least min_bins or 0, if the number of bins should not be restricted. - `'equal-frequency'`. Examples are assigned to bins, based on their feature values, according to the equal-frequency binning method. The following options may be provided using the {ref}`bracket-notation`: - - `bin_ratio` (Default value = `0.33`) A percentage that specifies how many bins should be used. For example, a value of 0.3 means that the number of bins should be set to 30% of the number of distinct values for a feature. + - `bin_ratio` (Default value = `0.33`) A percentage that specifies how many bins should be used. For example, a value of 0.3 means that the number of bins should be set to 30% of the total number of available training examples. - `min_bins` (Default value = `2`) The minimum number of bins. Must be at least 2. - `max_bins` (Default value = `0`) The maximum number of bins. Must be at least min_bins or 0, if the number of bins should not be restricted. 
diff --git a/python/subprojects/common/mlrl/common/cython/feature_binning.pxd b/python/subprojects/common/mlrl/common/cython/feature_binning.pxd index 07c28256a2..00914f4778 100644 --- a/python/subprojects/common/mlrl/common/cython/feature_binning.pxd +++ b/python/subprojects/common/mlrl/common/cython/feature_binning.pxd @@ -1,7 +1,7 @@ from mlrl.common.cython._types cimport float32, uint32 -cdef extern from "mlrl/common/binning/feature_binning_equal_width.hpp" nogil: +cdef extern from "mlrl/common/input/feature_binning_equal_width.hpp" nogil: cdef cppclass IEqualWidthFeatureBinningConfig: @@ -20,7 +20,7 @@ cdef extern from "mlrl/common/binning/feature_binning_equal_width.hpp" nogil: uint32 getMaxBins() const -cdef extern from "mlrl/common/binning/feature_binning_equal_frequency.hpp" nogil: +cdef extern from "mlrl/common/input/feature_binning_equal_frequency.hpp" nogil: cdef cppclass IEqualFrequencyFeatureBinningConfig: diff --git a/python/subprojects/testbed/tests/res/out/boomer/feature-binning-equal-frequency_binary-features-dense.txt b/python/subprojects/testbed/tests/res/out/boomer/feature-binning-equal-frequency_binary-features-dense.txt index 6b545ff8e0..db6fe9a443 100644 --- a/python/subprojects/testbed/tests/res/out/boomer/feature-binning-equal-frequency_binary-features-dense.txt +++ b/python/subprojects/testbed/tests/res/out/boomer/feature-binning-equal-frequency_binary-features-dense.txt @@ -12,20 +12,20 @@ DEBUG A sparse matrix is used to store the predicted labels INFO Successfully predicted in INFO Evaluation result for test data: -Example-wise F1 62.02 -Example-wise Jaccard 29.62 +Example-wise F1 62.64 +Example-wise Jaccard 31.28 Example-wise Precision 61.51 -Example-wise Recall 33.49 -Hamming Accuracy 94.62 -Hamming Loss 5.38 -Macro F1 12.52 -Macro Jaccard 6.73 -Macro Precision 92.05 -Macro Recall 8.11 -Micro F1 45.02 -Micro Jaccard 29.05 -Micro Precision 64.19 -Micro Recall 34.67 +Example-wise Recall 36.12 +Hamming Accuracy 94.68 +Hamming Loss 5.32 
+Macro F1 14.35 +Macro Jaccard 7.81 +Macro Precision 88.88 +Macro Recall 9.27 +Micro F1 47.2 +Micro Jaccard 30.89 +Micro Precision 63.9 +Micro Recall 37.42 Subset 0/1 Loss 96.98 Subset Accuracy 3.02 diff --git a/python/subprojects/testbed/tests/res/out/boomer/feature-binning-equal-frequency_binary-features-sparse.txt b/python/subprojects/testbed/tests/res/out/boomer/feature-binning-equal-frequency_binary-features-sparse.txt index 7a6d9a3ac0..ace0681ecc 100644 --- a/python/subprojects/testbed/tests/res/out/boomer/feature-binning-equal-frequency_binary-features-sparse.txt +++ b/python/subprojects/testbed/tests/res/out/boomer/feature-binning-equal-frequency_binary-features-sparse.txt @@ -12,20 +12,20 @@ DEBUG A sparse matrix is used to store the predicted labels INFO Successfully predicted in INFO Evaluation result for test data: -Example-wise F1 61.06 -Example-wise Jaccard 29.58 -Example-wise Precision 62.75 -Example-wise Recall 33.2 -Hamming Accuracy 94.62 -Hamming Loss 5.38 -Macro F1 9.81 -Macro Jaccard 6.25 -Macro Precision 94.94 -Macro Recall 7.58 -Micro F1 44.85 -Micro Jaccard 28.91 -Micro Precision 64.39 -Micro Recall 34.41 +Example-wise F1 62.64 +Example-wise Jaccard 31.28 +Example-wise Precision 61.51 +Example-wise Recall 36.12 +Hamming Accuracy 94.68 +Hamming Loss 5.32 +Macro F1 14.35 +Macro Jaccard 7.81 +Macro Precision 88.88 +Macro Recall 9.27 +Micro F1 47.2 +Micro Jaccard 30.89 +Micro Precision 63.9 +Micro Recall 37.42 Subset 0/1 Loss 96.98 Subset Accuracy 3.02 diff --git a/python/subprojects/testbed/tests/res/out/boomer/feature-binning-equal-frequency_nominal-features-dense.txt b/python/subprojects/testbed/tests/res/out/boomer/feature-binning-equal-frequency_nominal-features-dense.txt index 21c06a8213..75fc6f45cd 100644 --- a/python/subprojects/testbed/tests/res/out/boomer/feature-binning-equal-frequency_nominal-features-dense.txt +++ b/python/subprojects/testbed/tests/res/out/boomer/feature-binning-equal-frequency_nominal-features-dense.txt @@ -12,21 
+12,21 @@ DEBUG A dense matrix is used to store the predicted labels INFO Successfully predicted in INFO Evaluation result for test data: -Example-wise F1 71.82 -Example-wise Jaccard 49.49 -Example-wise Precision 71.85 -Example-wise Recall 57.74 -Hamming Accuracy 77.81 -Hamming Loss 22.19 -Macro F1 60.28 -Macro Jaccard 43.89 -Macro Precision 67.51 -Macro Recall 55.34 -Micro F1 62.01 -Micro Jaccard 44.94 -Micro Precision 68.27 -Micro Recall 56.8 -Subset 0/1 Loss 75 -Subset Accuracy 25 +Example-wise F1 71.67 +Example-wise Jaccard 49.15 +Example-wise Precision 73.64 +Example-wise Recall 56.8 +Hamming Accuracy 78.4 +Hamming Loss 21.6 +Macro F1 59.79 +Macro Jaccard 43.82 +Macro Precision 69.16 +Macro Recall 53.66 +Micro F1 62.09 +Micro Jaccard 45.02 +Micro Precision 70.51 +Micro Recall 55.47 +Subset 0/1 Loss 76.02 +Subset Accuracy 23.98 INFO Successfully finished after diff --git a/python/subprojects/testbed/tests/res/out/boomer/feature-binning-equal-frequency_nominal-features-sparse.txt b/python/subprojects/testbed/tests/res/out/boomer/feature-binning-equal-frequency_nominal-features-sparse.txt index e003954c4e..2e59815687 100644 --- a/python/subprojects/testbed/tests/res/out/boomer/feature-binning-equal-frequency_nominal-features-sparse.txt +++ b/python/subprojects/testbed/tests/res/out/boomer/feature-binning-equal-frequency_nominal-features-sparse.txt @@ -12,21 +12,21 @@ DEBUG A dense matrix is used to store the predicted labels INFO Successfully predicted in INFO Evaluation result for test data: -Example-wise F1 70.36 -Example-wise Jaccard 48.17 -Example-wise Precision 73.89 -Example-wise Recall 55.53 -Hamming Accuracy 77.72 -Hamming Loss 22.28 -Macro F1 58.29 -Macro Jaccard 42.16 -Macro Precision 68.8 -Macro Recall 51.65 -Micro F1 60.42 -Micro Jaccard 43.29 -Micro Precision 69.69 -Micro Recall 53.33 -Subset 0/1 Loss 75.51 -Subset Accuracy 24.49 +Example-wise F1 67.86 +Example-wise Jaccard 46.6 +Example-wise Precision 76.79 +Example-wise Recall 53.32 +Hamming 
Accuracy 78.74 +Hamming Loss 21.26 +Macro F1 58.93 +Macro Jaccard 42.71 +Macro Precision 73.26 +Macro Recall 51.04 +Micro F1 61.18 +Micro Jaccard 44.07 +Micro Precision 73.23 +Micro Recall 52.53 +Subset 0/1 Loss 77.55 +Subset Accuracy 22.45 INFO Successfully finished after diff --git a/python/subprojects/testbed/tests/res/out/boomer/feature-binning-equal-frequency_numerical-features-dense.txt b/python/subprojects/testbed/tests/res/out/boomer/feature-binning-equal-frequency_numerical-features-dense.txt index 1c1bc1b52a..2bbcc0a900 100644 --- a/python/subprojects/testbed/tests/res/out/boomer/feature-binning-equal-frequency_numerical-features-dense.txt +++ b/python/subprojects/testbed/tests/res/out/boomer/feature-binning-equal-frequency_numerical-features-dense.txt @@ -12,21 +12,21 @@ DEBUG A dense matrix is used to store the predicted labels INFO Successfully predicted in INFO Evaluation result for test data: -Example-wise F1 68.16 -Example-wise Jaccard 50.55 -Example-wise Precision 79.59 -Example-wise Recall 57.74 -Hamming Accuracy 80.27 -Hamming Loss 19.73 -Macro F1 61.71 -Macro Jaccard 46.62 -Macro Precision 73.01 -Macro Recall 55.9 -Micro F1 65.06 -Micro Jaccard 48.21 -Micro Precision 74.74 -Micro Recall 57.6 -Subset 0/1 Loss 74.49 -Subset Accuracy 25.51 +Example-wise F1 66.68 +Example-wise Jaccard 49.57 +Example-wise Precision 80.61 +Example-wise Recall 56.46 +Hamming Accuracy 80.1 +Hamming Loss 19.9 +Macro F1 59.83 +Macro Jaccard 44.79 +Macro Precision 74.18 +Macro Recall 53.49 +Micro F1 64 +Micro Jaccard 47.06 +Micro Precision 75.64 +Micro Recall 55.47 +Subset 0/1 Loss 75 +Subset Accuracy 25 INFO Successfully finished after diff --git a/python/subprojects/testbed/tests/res/out/boomer/feature-binning-equal-frequency_numerical-features-sparse.txt b/python/subprojects/testbed/tests/res/out/boomer/feature-binning-equal-frequency_numerical-features-sparse.txt index b4eced308f..29d8316f25 100644 --- 
a/python/subprojects/testbed/tests/res/out/boomer/feature-binning-equal-frequency_numerical-features-sparse.txt +++ b/python/subprojects/testbed/tests/res/out/boomer/feature-binning-equal-frequency_numerical-features-sparse.txt @@ -12,21 +12,21 @@ DEBUG A dense matrix is used to store the predicted labels INFO Successfully predicted in INFO Evaluation result for test data: -Example-wise F1 66.56 -Example-wise Jaccard 49.53 -Example-wise Precision 80.27 -Example-wise Recall 57.57 -Hamming Accuracy 80.36 -Hamming Loss 19.64 -Macro F1 63.53 -Macro Jaccard 47.3 -Macro Precision 76 -Macro Recall 56.14 -Micro F1 64.95 -Micro Jaccard 48.09 -Micro Precision 75.35 -Micro Recall 57.07 -Subset 0/1 Loss 76.53 -Subset Accuracy 23.47 +Example-wise F1 66.68 +Example-wise Jaccard 48.81 +Example-wise Precision 79.42 +Example-wise Recall 55.7 +Hamming Accuracy 79.59 +Hamming Loss 20.41 +Macro F1 61.17 +Macro Jaccard 45.09 +Macro Precision 73.55 +Macro Recall 54.13 +Micro F1 63.53 +Micro Jaccard 46.55 +Micro Precision 73.85 +Micro Recall 55.73 +Subset 0/1 Loss 75.51 +Subset Accuracy 24.49 INFO Successfully finished after diff --git a/python/subprojects/testbed/tests/res/out/boomer/feature-binning-equal-width_binary-features-dense.txt b/python/subprojects/testbed/tests/res/out/boomer/feature-binning-equal-width_binary-features-dense.txt index 6b545ff8e0..db6fe9a443 100644 --- a/python/subprojects/testbed/tests/res/out/boomer/feature-binning-equal-width_binary-features-dense.txt +++ b/python/subprojects/testbed/tests/res/out/boomer/feature-binning-equal-width_binary-features-dense.txt @@ -12,20 +12,20 @@ DEBUG A sparse matrix is used to store the predicted labels INFO Successfully predicted in INFO Evaluation result for test data: -Example-wise F1 62.02 -Example-wise Jaccard 29.62 +Example-wise F1 62.64 +Example-wise Jaccard 31.28 Example-wise Precision 61.51 -Example-wise Recall 33.49 -Hamming Accuracy 94.62 -Hamming Loss 5.38 -Macro F1 12.52 -Macro Jaccard 6.73 -Macro Precision 92.05 
-Macro Recall 8.11 -Micro F1 45.02 -Micro Jaccard 29.05 -Micro Precision 64.19 -Micro Recall 34.67 +Example-wise Recall 36.12 +Hamming Accuracy 94.68 +Hamming Loss 5.32 +Macro F1 14.35 +Macro Jaccard 7.81 +Macro Precision 88.88 +Macro Recall 9.27 +Micro F1 47.2 +Micro Jaccard 30.89 +Micro Precision 63.9 +Micro Recall 37.42 Subset 0/1 Loss 96.98 Subset Accuracy 3.02 diff --git a/python/subprojects/testbed/tests/res/out/boomer/feature-binning-equal-width_binary-features-sparse.txt b/python/subprojects/testbed/tests/res/out/boomer/feature-binning-equal-width_binary-features-sparse.txt index 7a6d9a3ac0..ace0681ecc 100644 --- a/python/subprojects/testbed/tests/res/out/boomer/feature-binning-equal-width_binary-features-sparse.txt +++ b/python/subprojects/testbed/tests/res/out/boomer/feature-binning-equal-width_binary-features-sparse.txt @@ -12,20 +12,20 @@ DEBUG A sparse matrix is used to store the predicted labels INFO Successfully predicted in INFO Evaluation result for test data: -Example-wise F1 61.06 -Example-wise Jaccard 29.58 -Example-wise Precision 62.75 -Example-wise Recall 33.2 -Hamming Accuracy 94.62 -Hamming Loss 5.38 -Macro F1 9.81 -Macro Jaccard 6.25 -Macro Precision 94.94 -Macro Recall 7.58 -Micro F1 44.85 -Micro Jaccard 28.91 -Micro Precision 64.39 -Micro Recall 34.41 +Example-wise F1 62.64 +Example-wise Jaccard 31.28 +Example-wise Precision 61.51 +Example-wise Recall 36.12 +Hamming Accuracy 94.68 +Hamming Loss 5.32 +Macro F1 14.35 +Macro Jaccard 7.81 +Macro Precision 88.88 +Macro Recall 9.27 +Micro F1 47.2 +Micro Jaccard 30.89 +Micro Precision 63.9 +Micro Recall 37.42 Subset 0/1 Loss 96.98 Subset Accuracy 3.02 diff --git a/python/subprojects/testbed/tests/res/out/boomer/feature-binning-equal-width_nominal-features-dense.txt b/python/subprojects/testbed/tests/res/out/boomer/feature-binning-equal-width_nominal-features-dense.txt index 21c06a8213..75fc6f45cd 100644 --- 
a/python/subprojects/testbed/tests/res/out/boomer/feature-binning-equal-width_nominal-features-dense.txt +++ b/python/subprojects/testbed/tests/res/out/boomer/feature-binning-equal-width_nominal-features-dense.txt @@ -12,21 +12,21 @@ DEBUG A dense matrix is used to store the predicted labels INFO Successfully predicted in INFO Evaluation result for test data: -Example-wise F1 71.82 -Example-wise Jaccard 49.49 -Example-wise Precision 71.85 -Example-wise Recall 57.74 -Hamming Accuracy 77.81 -Hamming Loss 22.19 -Macro F1 60.28 -Macro Jaccard 43.89 -Macro Precision 67.51 -Macro Recall 55.34 -Micro F1 62.01 -Micro Jaccard 44.94 -Micro Precision 68.27 -Micro Recall 56.8 -Subset 0/1 Loss 75 -Subset Accuracy 25 +Example-wise F1 71.67 +Example-wise Jaccard 49.15 +Example-wise Precision 73.64 +Example-wise Recall 56.8 +Hamming Accuracy 78.4 +Hamming Loss 21.6 +Macro F1 59.79 +Macro Jaccard 43.82 +Macro Precision 69.16 +Macro Recall 53.66 +Micro F1 62.09 +Micro Jaccard 45.02 +Micro Precision 70.51 +Micro Recall 55.47 +Subset 0/1 Loss 76.02 +Subset Accuracy 23.98 INFO Successfully finished after diff --git a/python/subprojects/testbed/tests/res/out/boomer/feature-binning-equal-width_nominal-features-sparse.txt b/python/subprojects/testbed/tests/res/out/boomer/feature-binning-equal-width_nominal-features-sparse.txt index e003954c4e..2e59815687 100644 --- a/python/subprojects/testbed/tests/res/out/boomer/feature-binning-equal-width_nominal-features-sparse.txt +++ b/python/subprojects/testbed/tests/res/out/boomer/feature-binning-equal-width_nominal-features-sparse.txt @@ -12,21 +12,21 @@ DEBUG A dense matrix is used to store the predicted labels INFO Successfully predicted in INFO Evaluation result for test data: -Example-wise F1 70.36 -Example-wise Jaccard 48.17 -Example-wise Precision 73.89 -Example-wise Recall 55.53 -Hamming Accuracy 77.72 -Hamming Loss 22.28 -Macro F1 58.29 -Macro Jaccard 42.16 -Macro Precision 68.8 -Macro Recall 51.65 -Micro F1 60.42 -Micro Jaccard 43.29 
-Micro Precision 69.69 -Micro Recall 53.33 -Subset 0/1 Loss 75.51 -Subset Accuracy 24.49 +Example-wise F1 67.86 +Example-wise Jaccard 46.6 +Example-wise Precision 76.79 +Example-wise Recall 53.32 +Hamming Accuracy 78.74 +Hamming Loss 21.26 +Macro F1 58.93 +Macro Jaccard 42.71 +Macro Precision 73.26 +Macro Recall 51.04 +Micro F1 61.18 +Micro Jaccard 44.07 +Micro Precision 73.23 +Micro Recall 52.53 +Subset 0/1 Loss 77.55 +Subset Accuracy 22.45 INFO Successfully finished after diff --git a/python/subprojects/testbed/tests/res/out/boomer/feature-binning-equal-width_numerical-features-dense.txt b/python/subprojects/testbed/tests/res/out/boomer/feature-binning-equal-width_numerical-features-dense.txt index fe0902805f..9f035f3ce9 100644 --- a/python/subprojects/testbed/tests/res/out/boomer/feature-binning-equal-width_numerical-features-dense.txt +++ b/python/subprojects/testbed/tests/res/out/boomer/feature-binning-equal-width_numerical-features-dense.txt @@ -12,21 +12,21 @@ DEBUG A dense matrix is used to store the predicted labels INFO Successfully predicted in INFO Evaluation result for test data: -Example-wise F1 71.6 -Example-wise Jaccard 54.51 -Example-wise Precision 81.12 -Example-wise Recall 61.9 -Hamming Accuracy 81.97 -Hamming Loss 18.03 -Macro F1 66.42 -Macro Jaccard 50.44 -Macro Precision 78.82 -Macro Recall 59.49 -Micro F1 68.36 -Micro Jaccard 51.93 -Micro Precision 77.63 -Micro Recall 61.07 -Subset 0/1 Loss 71.94 -Subset Accuracy 28.06 +Example-wise F1 63.64 +Example-wise Jaccard 44.73 +Example-wise Precision 78.91 +Example-wise Recall 50.85 +Hamming Accuracy 78.4 +Hamming Loss 21.6 +Macro F1 58.23 +Macro Jaccard 42.68 +Macro Precision 71.01 +Macro Recall 50.77 +Micro F1 60.06 +Micro Jaccard 42.92 +Micro Precision 73.18 +Micro Recall 50.93 +Subset 0/1 Loss 77.55 +Subset Accuracy 22.45 INFO Successfully finished after diff --git a/python/subprojects/testbed/tests/res/out/boomer/feature-binning-equal-width_numerical-features-sparse.txt 
b/python/subprojects/testbed/tests/res/out/boomer/feature-binning-equal-width_numerical-features-sparse.txt index 91186255cb..f938da8d18 100644 --- a/python/subprojects/testbed/tests/res/out/boomer/feature-binning-equal-width_numerical-features-sparse.txt +++ b/python/subprojects/testbed/tests/res/out/boomer/feature-binning-equal-width_numerical-features-sparse.txt @@ -12,21 +12,21 @@ DEBUG A dense matrix is used to store the predicted labels INFO Successfully predicted in INFO Evaluation result for test data: -Example-wise F1 71.09 -Example-wise Jaccard 51.28 -Example-wise Precision 78.4 -Example-wise Recall 58.16 -Hamming Accuracy 80.61 -Hamming Loss 19.39 -Macro F1 62.69 -Macro Jaccard 47.34 -Macro Precision 73.96 -Macro Recall 56.39 -Micro F1 65.66 -Micro Jaccard 48.88 -Micro Precision 75.43 -Micro Recall 58.13 -Subset 0/1 Loss 73.98 -Subset Accuracy 26.02 +Example-wise F1 63.64 +Example-wise Jaccard 44.73 +Example-wise Precision 78.91 +Example-wise Recall 50.85 +Hamming Accuracy 78.4 +Hamming Loss 21.6 +Macro F1 58.23 +Macro Jaccard 42.68 +Macro Precision 71.01 +Macro Recall 50.77 +Micro F1 60.06 +Micro Jaccard 42.92 +Micro Precision 73.18 +Micro Recall 50.93 +Subset 0/1 Loss 77.55 +Subset Accuracy 22.45 INFO Successfully finished after