Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ML] Don't try and correct for sample count when estimating statistic variances for anomaly detection #2677

Draft
wants to merge 20 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 3 additions & 7 deletions docs/CHANGELOG.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,9 @@
* Update the Pytorch library to version 2.3.1. (See {ml-pull}2688[#2688].)
* Allow the user to force a detector to shift time series state by a specific amount.
(See {ml-pull}2695[#2695].)
* Improve variance estimation for anomaly detection when record counts
suddenly drop. (See {ml-pull}2677[#2677].)


== {es} version 8.15.0

Expand All @@ -53,13 +56,6 @@

* Handle any exception thrown by inference. (See {ml-pull}2680[#2680].)

== {es} version 8.14.1

=== Enhancements

* Improve memory allocation management for JSON processing to reduce memory usage.
(See {ml-pull}2679[#2679].)

== {es} version 8.14.0

=== Bug Fixes
Expand Down
15 changes: 15 additions & 0 deletions include/core/CSmallVector.h
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,21 @@ class CSmallVector : public boost::container::small_vector<T, N> {
return *this;
}

std::string toDelimited(const std::string& delimiter = ", ") const {
std::string result;
for (size_type i = 0; i < this->size(); ++i) {
result += std::to_string((*this)[i]);
if (i < this->size() - 1) {
result += delimiter;
}
// Reserve space to minimize concatenation overhead.
if (i == 0) {
result.reserve(result.size() * this->size());
}
}
return result;
}

private:
TBase& baseRef() { return *this; }
const TBase& baseRef() const { return *this; }
Expand Down
6 changes: 6 additions & 0 deletions include/maths/common/CBasicStatisticsPersist.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

#include <core/CHashing.h>
#include <core/CLogger.h>
#include <core/CSmallVector.h>
#include <core/CStringUtils.h>

#include <maths/common/CBasicStatistics.h>
Expand Down Expand Up @@ -99,6 +100,11 @@ template<typename T>
inline std::string typeToString(const CSymmetricMatrix<T>& value) {
return value.toDelimited();
}

//! Convert a small vector to its string representation.
//!
//! \param[in] value The vector to convert.
//! \return Whatever value.toDelimited() returns when called with no
//! arguments.
template<typename T, std::size_t N>
inline std::string typeToString(const core::CSmallVector<T, N>& value) {
    std::string delimited{value.toDelimited()};
    return delimited;
}
}

template<typename T, unsigned int ORDER>
Expand Down
2 changes: 1 addition & 1 deletion include/model/CAnomalyDetector.h
Original file line number Diff line number Diff line change
Expand Up @@ -259,7 +259,7 @@ class MODEL_EXPORT CAnomalyDetector : public CMonitoredResource {
void resetBucket(core_t::TTime bucketStart);

//! Release memory that is no longer needed
void releaseMemory(core_t::TTime samplingCutoffTime);
void releaseMemory();

//! Print the detector memory usage to the given stream
void showMemoryUsage(std::ostream& stream) const;
Expand Down
14 changes: 0 additions & 14 deletions include/model/CAnomalyDetectorModelConfig.h
Original file line number Diff line number Diff line change
Expand Up @@ -102,17 +102,6 @@ class MODEL_EXPORT CAnomalyDetectorModelConfig {
//! Default maximum number of buckets for receiving out of order records.
static const std::size_t DEFAULT_LATENCY_BUCKETS;

//! Default amount by which metric sample count is reduced for fine-grained
//! sampling when there is no latency.
static const std::size_t DEFAULT_SAMPLE_COUNT_FACTOR_NO_LATENCY;

//! Default amount by which metric sample count is reduced for fine-grained
//! sampling when there is latency.
static const std::size_t DEFAULT_SAMPLE_COUNT_FACTOR_WITH_LATENCY;

//! Default amount by which the metric sample queue expands when it is full.
static const double DEFAULT_SAMPLE_QUEUE_GROWTH_FACTOR;

//! Bucket length corresponding to the default decay and learn rates.
static const core_t::TTime STANDARD_BUCKET_LENGTH;
//@}
Expand Down Expand Up @@ -444,9 +433,6 @@ class MODEL_EXPORT CAnomalyDetectorModelConfig {
//! bucket length.
double bucketNormalizationFactor() const;

//! The time window during which samples are accepted.
core_t::TTime samplingAgeCutoff() const;

private:
//! Bucket length.
core_t::TTime m_BucketLength{0};
Expand Down
9 changes: 2 additions & 7 deletions include/model/CBucketGatherer.h
Original file line number Diff line number Diff line change
Expand Up @@ -328,7 +328,7 @@ class MODEL_EXPORT CBucketGatherer {
virtual bool resetBucket(core_t::TTime bucketStart) = 0;

//! Release memory that is no longer needed
virtual void releaseMemory(core_t::TTime samplingCutoffTime) = 0;
virtual void releaseMemory() = 0;

//! Remove the values in queue for the people or attributes
//! in \p toRemove.
Expand Down Expand Up @@ -378,19 +378,14 @@ class MODEL_EXPORT CBucketGatherer {
//!
//! \param[in] time The time of interest.
//! \param[out] result Filled in with the feature data at \p time.
virtual void featureData(core_t::TTime time,
core_t::TTime bucketLength,
TFeatureAnyPrVec& result) const = 0;
virtual void featureData(core_t::TTime time, TFeatureAnyPrVec& result) const = 0;

//! Get a reference to the owning data gatherer.
const CDataGatherer& dataGatherer() const;

//! Has this pid/cid pair had only explicit null records?
bool hasExplicitNullsOnly(core_t::TTime time, std::size_t pid, std::size_t cid) const;

//! Create samples if possible for the bucket pointed out by \p time.
virtual void sample(core_t::TTime time) = 0;

//! Persist state by passing information \p inserter.
virtual void acceptPersistInserter(core::CStatePersistInserter& inserter) const = 0;

Expand Down
48 changes: 4 additions & 44 deletions include/model/CDataGatherer.h
Original file line number Diff line number Diff line change
Expand Up @@ -162,10 +162,6 @@ class MODEL_EXPORT CDataGatherer {
//! \param[in] features The features of the data to model.
//! \param[in] startTime The start of the time interval for which
//! to gather data.
//! \param[in] sampleCountOverride for the number of measurements
//! in a statistic. (Note that this is intended for testing only.)
//! A zero value means that the data gatherer class will determine
//! an appropriate value for the bucket length and data rate.
CDataGatherer(model_t::EAnalysisCategory gathererType,
model_t::ESummaryMode summaryMode,
const SModelParams& modelParams,
Expand All @@ -177,8 +173,7 @@ class MODEL_EXPORT CDataGatherer {
const TStrVec& influenceFieldNames,
const CSearchKey& key,
const TFeatureVec& features,
core_t::TTime startTime,
int sampleCountOverride);
core_t::TTime startTime);

//! Construct from a state document.
CDataGatherer(model_t::EAnalysisCategory gathererType,
Expand Down Expand Up @@ -334,10 +329,9 @@ class MODEL_EXPORT CDataGatherer {
//! \tparam T The type of the feature data.
template<typename T>
bool featureData(core_t::TTime time,
core_t::TTime bucketLength,
std::vector<std::pair<model_t::EFeature, T>>& result) const {
TFeatureAnyPrVec rawFeatureData;
m_BucketGatherer->featureData(time, bucketLength, rawFeatureData);
m_BucketGatherer->featureData(time, rawFeatureData);

bool succeeded = true;

Expand Down Expand Up @@ -487,36 +481,6 @@ class MODEL_EXPORT CDataGatherer {
bool isAttributeActive(std::size_t cid) const;
//@}

//! \name Metric
//@{
//! Get the current number of measurements in a sample for
//! the model of the entity identified by \p id.
//!
//! If we are performing temporal analysis we have one sample
//! count per person and if we are performing population analysis
//! we have one sample count per attribute.
double sampleCount(std::size_t id) const;

//! Get the effective number of measurements in a sample for
//! the model of the entity identified by \p id.
//!
//! If we are performing temporal analysis we have one sample
//! count per person and if we are performing population analysis
//! we have one sample count per attribute.
double effectiveSampleCount(std::size_t id) const;

//! Reset the number of measurements in a sample for the entity
//! identified by \p id.
//!
//! If we are performing individual analysis we have one sample
//! count per person and if we are performing population analysis
//! we have one sample count per attribute.
void resetSampleCount(std::size_t id);

//! Get the sample counts.
const TSampleCountsPtr& sampleCounts() const;
//@}

//! \name Time
//@{
//! Get the start of the current bucketing time interval.
Expand Down Expand Up @@ -578,7 +542,7 @@ class MODEL_EXPORT CDataGatherer {
bool resetBucket(core_t::TTime bucketStart);

//! Release memory that is no longer needed
void releaseMemory(core_t::TTime samplingCutoffTime);
void releaseMemory();

//! Get the global configuration parameters.
const SModelParams& params() const;
Expand Down Expand Up @@ -701,8 +665,7 @@ class MODEL_EXPORT CDataGatherer {
const std::string& attributeFieldName,
const std::string& valueFieldName,
const TStrVec& influenceFieldNames,
core_t::TTime startTime,
unsigned int sampleCountOverride);
core_t::TTime startTime);

private:
//! The type of the bucket gatherer(s) used.
Expand Down Expand Up @@ -739,9 +702,6 @@ class MODEL_EXPORT CDataGatherer {

//! If true the gatherer will process missing person field values.
bool m_UseNull;

//! The object responsible for managing sample counts.
TSampleCountsPtr m_SampleCounts;
};
}
}
Expand Down
9 changes: 2 additions & 7 deletions include/model/CEventRateBucketGatherer.h
Original file line number Diff line number Diff line change
Expand Up @@ -259,7 +259,7 @@ class MODEL_EXPORT CEventRateBucketGatherer final : public CBucketGatherer {
bool resetBucket(core_t::TTime bucketStart) override;

//! Release memory that is no longer needed
void releaseMemory(core_t::TTime samplingCutoffTime) override;
void releaseMemory() override;

//! \name Features
//@{
Expand All @@ -268,15 +268,10 @@ class MODEL_EXPORT CEventRateBucketGatherer final : public CBucketGatherer {
//!
//! \param[in] time The time of interest.
//! \param[out] result Filled in with the feature data at \p time.
void featureData(core_t::TTime time,
core_t::TTime bucketLength,
TFeatureAnyPrVec& result) const override;
void featureData(core_t::TTime time, TFeatureAnyPrVec& result) const override;
//@}

private:
//! No-op.
void sample(core_t::TTime time) override;

//! Append the counts by person for the bucketing interval containing
//! \p time.
//!
Expand Down
Loading