From 75fe3d528fa34067b7f7416043e0a6bb167f74b6 Mon Sep 17 00:00:00 2001 From: Michael Rapp Date: Fri, 13 Oct 2023 14:20:19 +0200 Subject: [PATCH 1/4] Add unit tests for the class EqualFeatureVector. --- cpp/subprojects/common/meson.build | 1 + .../common/input/feature_vector_equal.cpp | 39 +++++++++++++++++++ 2 files changed, 40 insertions(+) create mode 100644 cpp/subprojects/common/test/mlrl/common/input/feature_vector_equal.cpp diff --git a/cpp/subprojects/common/meson.build b/cpp/subprojects/common/meson.build index a181587745..b63e0a9744 100644 --- a/cpp/subprojects/common/meson.build +++ b/cpp/subprojects/common/meson.build @@ -128,6 +128,7 @@ test_files = [ 'test/mlrl/common/input/feature_type_nominal.cpp', 'test/mlrl/common/input/feature_type_numerical.cpp', 'test/mlrl/common/input/feature_type_ordinal.cpp', + 'test/mlrl/common/input/feature_vector_equal.cpp', 'test/mlrl/common/info.cpp' ] diff --git a/cpp/subprojects/common/test/mlrl/common/input/feature_vector_equal.cpp b/cpp/subprojects/common/test/mlrl/common/input/feature_vector_equal.cpp new file mode 100644 index 0000000000..2fe1ccc497 --- /dev/null +++ b/cpp/subprojects/common/test/mlrl/common/input/feature_vector_equal.cpp @@ -0,0 +1,39 @@ +#include "mlrl/common/input/feature_vector_equal.hpp" + +#include + +TEST(EqualFeatureVectorTest, createFilteredFeatureVectorFromIndices) { + EqualFeatureVector featureVector; + std::unique_ptr existing; + std::unique_ptr filtered = featureVector.createFilteredFeatureVector(existing, 0, 1); + const EqualFeatureVector* filteredFeatureVector = dynamic_cast(filtered.get()); + EXPECT_TRUE(filteredFeatureVector != nullptr); +} + +TEST(EqualFeatureVectorTest, createFilteredFeatureVectorFromIndicesUsingExisting) { + EqualFeatureVector featureVector; + std::unique_ptr existing = std::make_unique(); + std::unique_ptr filtered = featureVector.createFilteredFeatureVector(existing, 0, 1); + const EqualFeatureVector* filteredFeatureVector = dynamic_cast(filtered.get()); + EXPECT_TRUE(filteredFeatureVector != nullptr); + EXPECT_TRUE(existing.get() == nullptr); +} + +TEST(EqualFeatureVectorTest, createFilteredFeatureVectorFromCoverageMask) { + EqualFeatureVector featureVector; + std::unique_ptr existing; + CoverageMask coverageMask(10); + std::unique_ptr filtered = featureVector.createFilteredFeatureVector(existing, coverageMask); + const EqualFeatureVector* filteredFeatureVector = dynamic_cast(filtered.get()); + EXPECT_TRUE(filteredFeatureVector != nullptr); +} + +TEST(EqualFeatureVectorTest, createFilteredFeatureVectorFromCoverageMaskUsingExisting) { + EqualFeatureVector featureVector; + std::unique_ptr existing = std::make_unique(); + CoverageMask coverageMask(10); + std::unique_ptr filtered = featureVector.createFilteredFeatureVector(existing, coverageMask); + const EqualFeatureVector* filteredFeatureVector = dynamic_cast(filtered.get()); + EXPECT_TRUE(filteredFeatureVector != nullptr); + EXPECT_TRUE(existing.get() == nullptr); +} From 6785e0a5e1d07b47adf271e506f457a1972f7e8c Mon Sep 17 00:00:00 2001 From: Michael Rapp Date: Fri, 13 Oct 2023 15:01:40 +0200 Subject: [PATCH 2/4] Change visibility of the field "indptr_" of the class NominalFeatureVector from private to protected. --- .../mlrl/common/input/feature_vector_nominal.hpp | 10 ++++++++-- .../src/mlrl/common/input/feature_vector_nominal.cpp | 4 ++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/cpp/subprojects/common/include/mlrl/common/input/feature_vector_nominal.hpp b/cpp/subprojects/common/include/mlrl/common/input/feature_vector_nominal.hpp index 727bf06681..6b2271da21 100644 --- a/cpp/subprojects/common/include/mlrl/common/input/feature_vector_nominal.hpp +++ b/cpp/subprojects/common/include/mlrl/common/input/feature_vector_nominal.hpp @@ -16,12 +16,18 @@ class NominalFeatureVector : public AbstractFeatureVector { uint32* indices_; - uint32* indptr_; - const uint32 numValues_; const int32 majorityValue_; + protected: + + /** + * A pointer to an array that stores the indices of the first element in `indices_` that corresponds to a + * certain value. + */ + uint32* indptr_; + public: /** diff --git a/cpp/subprojects/common/src/mlrl/common/input/feature_vector_nominal.cpp b/cpp/subprojects/common/src/mlrl/common/input/feature_vector_nominal.cpp index bdad091bdf..13a8cad4ec 100644 --- a/cpp/subprojects/common/src/mlrl/common/input/feature_vector_nominal.cpp +++ b/cpp/subprojects/common/src/mlrl/common/input/feature_vector_nominal.cpp @@ -1,8 +1,8 @@ #include "mlrl/common/input/feature_vector_nominal.hpp" NominalFeatureVector::NominalFeatureVector(uint32 numValues, uint32 numExamples, int32 majorityValue) - : values_(new int32[numValues]), indices_(new uint32[numExamples]), indptr_(new uint32[numValues + 1]), - numValues_(numValues), majorityValue_(majorityValue) { + : values_(new int32[numValues]), indices_(new uint32[numExamples]), numValues_(numValues), + majorityValue_(majorityValue), indptr_(new uint32[numValues + 1]) { indptr_[numValues] = numExamples; } From 4a4cd89c0ed15475602aab93e32a951637241d19 Mon Sep 17 00:00:00 2001 From: Michael Rapp Date: Fri, 13 Oct 2023 15:12:58 +0200 Subject: [PATCH 3/4] Change the visibility of the field "indices_" of the class BinaryFeatureVector from private to protected. --- .../mlrl/common/input/feature_vector_nominal.hpp | 7 +++++-- .../mlrl/common/input/feature_vector_nominal.cpp | 13 ++++++++----- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/cpp/subprojects/common/include/mlrl/common/input/feature_vector_nominal.hpp b/cpp/subprojects/common/include/mlrl/common/input/feature_vector_nominal.hpp index 6b2271da21..805fcf60b3 100644 --- a/cpp/subprojects/common/include/mlrl/common/input/feature_vector_nominal.hpp +++ b/cpp/subprojects/common/include/mlrl/common/input/feature_vector_nominal.hpp @@ -14,14 +14,17 @@ class NominalFeatureVector : public AbstractFeatureVector { int32* values_; - uint32* indices_; - const uint32 numValues_; const int32 majorityValue_; protected: + /** + * A pointer to an array that stores the indices of all examples not associated with the majority value. + */ + uint32* indices_; + /** * A pointer to an array that stores the indices of the first element in `indices_` that corresponds to a * certain value. diff --git a/cpp/subprojects/common/src/mlrl/common/input/feature_vector_nominal.cpp b/cpp/subprojects/common/src/mlrl/common/input/feature_vector_nominal.cpp index 13a8cad4ec..75cdd30f86 100644 --- a/cpp/subprojects/common/src/mlrl/common/input/feature_vector_nominal.cpp +++ b/cpp/subprojects/common/src/mlrl/common/input/feature_vector_nominal.cpp @@ -1,15 +1,18 @@ #include "mlrl/common/input/feature_vector_nominal.hpp" +#include + NominalFeatureVector::NominalFeatureVector(uint32 numValues, uint32 numExamples, int32 majorityValue) - : values_(new int32[numValues]), indices_(new uint32[numExamples]), numValues_(numValues), - majorityValue_(majorityValue), indptr_(new uint32[numValues + 1]) { + : values_((int32*) malloc(numValues * sizeof(int32))), numValues_(numValues), majorityValue_(majorityValue), + indices_((uint32*) malloc(numExamples * sizeof(uint32))), + indptr_((uint32*) malloc((numValues + 1) * sizeof(uint32))) { indptr_[numValues] = numExamples; } NominalFeatureVector::~NominalFeatureVector() { - delete[] values_; - delete[] indices_; - delete[] indptr_; + free(values_); + free(indices_); + free(indptr_); } NominalFeatureVector::value_iterator NominalFeatureVector::values_begin() { From cbecc608161be81962180f9606efc3fba7e60e1d Mon Sep 17 00:00:00 2001 From: Michael Rapp Date: Fri, 13 Oct 2023 16:07:06 +0200 Subject: [PATCH 4/4] Implement "createFilteredFeatureVector" functions of the class BinaryFeatureVector. --- cpp/subprojects/common/meson.build | 1 + .../common/input/feature_vector_binary.cpp | 61 ++++++- .../common/input/feature_vector_binary.cpp | 153 ++++++++++++++++++ 3 files changed, 211 insertions(+), 4 deletions(-) create mode 100644 cpp/subprojects/common/test/mlrl/common/input/feature_vector_binary.cpp diff --git a/cpp/subprojects/common/meson.build b/cpp/subprojects/common/meson.build index b63e0a9744..8935e30291 100644 --- a/cpp/subprojects/common/meson.build +++ b/cpp/subprojects/common/meson.build @@ -128,6 +128,7 @@ test_files = [ 'test/mlrl/common/input/feature_type_nominal.cpp', 'test/mlrl/common/input/feature_type_numerical.cpp', 'test/mlrl/common/input/feature_type_ordinal.cpp', + 'test/mlrl/common/input/feature_vector_binary.cpp', 'test/mlrl/common/input/feature_vector_equal.cpp', 'test/mlrl/common/info.cpp' ] diff --git a/cpp/subprojects/common/src/mlrl/common/input/feature_vector_binary.cpp b/cpp/subprojects/common/src/mlrl/common/input/feature_vector_binary.cpp index 66b7146f4d..25b4cb6353 100644 --- a/cpp/subprojects/common/src/mlrl/common/input/feature_vector_binary.cpp +++ b/cpp/subprojects/common/src/mlrl/common/input/feature_vector_binary.cpp @@ -1,5 +1,7 @@ #include "mlrl/common/input/feature_vector_binary.hpp" +#include "mlrl/common/input/feature_vector_equal.hpp" + BinaryFeatureVector::BinaryFeatureVector(uint32 numMinorityExamples, int32 minorityValue, int32 majorityValue) : NominalFeatureVector(1, numMinorityExamples, majorityValue) { this->values_begin()[0] = minorityValue; @@ -8,12 +10,63 @@ BinaryFeatureVector::BinaryFeatureVector(uint32 numMinorityExamples, int32 minor std::unique_ptr BinaryFeatureVector::createFilteredFeatureVector( std::unique_ptr& existing, uint32 start, uint32 end) const { - // TODO Implement - return nullptr; + return std::make_unique(); } std::unique_ptr BinaryFeatureVector::createFilteredFeatureVector( std::unique_ptr& existing, const CoverageMask& coverageMask) const { - // TODO Implement - return nullptr; + index_const_iterator indexIterator = this->indices_cbegin(0); + index_const_iterator indicesEnd = this->indices_cend(0); + uint32 maxIndices = indicesEnd - indexIterator; + std::unique_ptr filteredFeatureVectorPtr; + BinaryFeatureVector* existingPtr = dynamic_cast(existing.get()); + + if (existingPtr) { + existing.release(); + filteredFeatureVectorPtr = std::unique_ptr(existingPtr); + + // Filter the indices of examples with missing feature values... + for (auto it = filteredFeatureVectorPtr->missing_indices_cbegin(); + it != filteredFeatureVectorPtr->missing_indices_cend();) { + uint32 index = *it; + it++; + + if (!coverageMask.isCovered(index)) { + filteredFeatureVectorPtr->setMissing(index, false); + } + } + } else { + filteredFeatureVectorPtr = + std::make_unique(maxIndices, this->values_cbegin()[0], this->getMajorityValue()); + + // Add the indices of examples with missing feature values... + for (auto it = this->missing_indices_cbegin(); it != this->missing_indices_cend(); it++) { + uint32 index = *it; + + if (coverageMask.isCovered(index)) { + filteredFeatureVectorPtr->setMissing(index, true); + } + } + } + + // Filter the indices of examples associated with the minority value... + index_iterator filteredIndexIterator = filteredFeatureVectorPtr->indices_begin(0); + uint32 n = 0; + + for (uint32 i = 0; i < maxIndices; i++) { + uint32 index = indexIterator[i]; + + if (coverageMask.isCovered(index)) { + filteredIndexIterator[n] = index; + n++; + } + } + + if (n > 0) { + filteredFeatureVectorPtr->indices_ = (uint32*) realloc(filteredFeatureVectorPtr->indices_, n * sizeof(uint32)); + filteredFeatureVectorPtr->indptr_[1] = n; + return filteredFeatureVectorPtr; + } + + return std::make_unique(); } diff --git a/cpp/subprojects/common/test/mlrl/common/input/feature_vector_binary.cpp b/cpp/subprojects/common/test/mlrl/common/input/feature_vector_binary.cpp new file mode 100644 index 0000000000..7ed55b58d0 --- /dev/null +++ b/cpp/subprojects/common/test/mlrl/common/input/feature_vector_binary.cpp @@ -0,0 +1,153 @@ +#include "mlrl/common/input/feature_vector_binary.hpp" + +#include "mlrl/common/input/feature_vector_equal.hpp" + +#include + +TEST(BinaryFeatureVectorTest, createFilteredFeatureVectorFromIndices) { + BinaryFeatureVector featureVector(10, 0, 1); + std::unique_ptr existing; + std::unique_ptr filtered = featureVector.createFilteredFeatureVector(existing, 0, 1); + const EqualFeatureVector* filteredFeatureVector = dynamic_cast(filtered.get()); + EXPECT_TRUE(filteredFeatureVector != nullptr); +} + +TEST(BinaryFeatureVectorTest, createFilteredFeatureVectorFromCoverageMask) { + uint32 numMinorityExamples = 10; + BinaryFeatureVector featureVector(numMinorityExamples, 0, 1); + BinaryFeatureVector::index_iterator indexIterator = featureVector.indices_begin(0); + + for (uint32 i = 0; i < numMinorityExamples; i++) { + indexIterator[i] = i; + } + + uint32 numMissingIndices = 10; + + for (uint32 i = numMinorityExamples; i < numMinorityExamples + numMissingIndices; i++) { + featureVector.setMissing(i, true); + } + + CoverageMask coverageMask(numMinorityExamples + numMissingIndices); + uint32 indicatorValue = 1; + coverageMask.setIndicatorValue(indicatorValue); + CoverageMask::iterator coverageMaskIterator = coverageMask.begin(); + + for (uint32 i = 0; i < numMinorityExamples + numMissingIndices; i++) { + if (i % 2 == 0) { + coverageMaskIterator[i] = indicatorValue; + } + } + + std::unique_ptr existing; + std::unique_ptr filtered = featureVector.createFilteredFeatureVector(existing, coverageMask); + const BinaryFeatureVector* filteredFeatureVector = dynamic_cast(filtered.get()); + EXPECT_TRUE(filteredFeatureVector != nullptr); + + // Check filtered indices... + BinaryFeatureVector::index_const_iterator indicesBegin = filteredFeatureVector->indices_cbegin(0); + BinaryFeatureVector::index_const_iterator indicesEnd = filteredFeatureVector->indices_cend(0); + uint32 numIndices = indicesEnd - indicesBegin; + EXPECT_EQ(numIndices, numMinorityExamples / 2); + std::unordered_set indices; + + for (auto it = indicesBegin; it != indicesEnd; it++) { + indices.emplace(*it); + } + + for (uint32 i = 0; i < numMinorityExamples; i++) { + if (i % 2 == 0) { + EXPECT_TRUE(indices.find(i) != indices.end()); + } else { + EXPECT_TRUE(indices.find(i) == indices.end()); + } + } + + // Check missing indices... + for (uint32 i = numMinorityExamples; i < numMinorityExamples + numMissingIndices; i++) { + if (i % 2 == 0) { + EXPECT_TRUE(filteredFeatureVector->isMissing(i)); + } else { + EXPECT_FALSE(filteredFeatureVector->isMissing(i)); + } + } +} + +TEST(BinaryFeatureVectorTest, createFilteredFeatureVectorFromCoverageMaskUsingExisting) { + uint32 numMinorityExamples = 10; + std::unique_ptr featureVector = + std::make_unique(numMinorityExamples, 0, 1); + BinaryFeatureVector::index_iterator indexIterator = featureVector->indices_begin(0); + + for (uint32 i = 0; i < numMinorityExamples; i++) { + indexIterator[i] = i; + } + + uint32 numMissingIndices = 10; + + for (uint32 i = numMinorityExamples; i < numMinorityExamples + numMissingIndices; i++) { + featureVector->setMissing(i, true); + } + + CoverageMask coverageMask(numMinorityExamples + numMissingIndices); + uint32 indicatorValue = 1; + coverageMask.setIndicatorValue(indicatorValue); + CoverageMask::iterator coverageMaskIterator = coverageMask.begin(); + + for (uint32 i = 0; i < numMinorityExamples + numMissingIndices; i++) { + if (i % 2 == 0) { + coverageMaskIterator[i] = indicatorValue; + } + } + + std::unique_ptr existing = std::move(featureVector); + std::unique_ptr filtered = existing->createFilteredFeatureVector(existing, coverageMask); + const BinaryFeatureVector* filteredFeatureVector = dynamic_cast(filtered.get()); + EXPECT_TRUE(filteredFeatureVector != nullptr); + EXPECT_TRUE(existing.get() == nullptr); + + // Check filtered indices... + BinaryFeatureVector::index_const_iterator indicesBegin = filteredFeatureVector->indices_cbegin(0); + BinaryFeatureVector::index_const_iterator indicesEnd = filteredFeatureVector->indices_cend(0); + uint32 numIndices = indicesEnd - indicesBegin; + EXPECT_EQ(numIndices, numMinorityExamples / 2); + std::unordered_set indices; + + for (auto it = indicesBegin; it != indicesEnd; it++) { + indices.emplace(*it); + } + + for (uint32 i = 0; i < numMinorityExamples; i++) { + if (i % 2 == 0) { + EXPECT_TRUE(indices.find(i) != indices.end()); + } else { + EXPECT_TRUE(indices.find(i) == indices.end()); + } + } + + // Check missing indices... + for (uint32 i = numMinorityExamples; i < numMinorityExamples + numMissingIndices; i++) { + if (i % 2 == 0) { + EXPECT_TRUE(filteredFeatureVector->isMissing(i)); + } else { + EXPECT_FALSE(filteredFeatureVector->isMissing(i)); + } + } +} + +TEST(BinaryFeatureVectorTest, createFilteredFeatureVectorFromCoverageMaskReturnsEqualFeatureVector) { + uint32 numMinorityExamples = 10; + BinaryFeatureVector featureVector(numMinorityExamples, 0, 1); + BinaryFeatureVector::index_iterator indexIterator = featureVector.indices_begin(0); + + for (uint32 i = 0; i < numMinorityExamples; i++) { + indexIterator[i] = i; + } + + CoverageMask coverageMask(numMinorityExamples); + coverageMask.setIndicatorValue(1); + + std::unique_ptr existing; + std::unique_ptr filtered = featureVector.createFilteredFeatureVector(existing, coverageMask); + const EqualFeatureVector* filteredFeatureVector = dynamic_cast(filtered.get()); + EXPECT_TRUE(filteredFeatureVector != nullptr); +}