add isolation forest algorithm

c9s · Jan 3, 2025 · 1493234 · 1493234
1 parent 31069e5
commit 1493234
Show file tree

Hide file tree

Showing 11 changed files with 597 additions and 4 deletions.
diff --git a/go.mod b/go.mod
@@ -125,7 +125,6 @@ require (
 	github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
 	github.com/modern-go/reflect2 v1.0.2 // indirect
 	github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
-	github.com/narumiruna/go-iforest v0.2.2 // indirect
 	github.com/pelletier/go-toml/v2 v2.2.3 // indirect
 	github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
 	github.com/prometheus/client_model v0.6.1 // indirect

diff --git a/go.sum b/go.sum
@@ -446,8 +446,6 @@ github.com/muesli/kmeans v0.3.0/go.mod h1:eNyybq0tX9/iBEP6EMU4Y7dpmGK0uEhODdZpnG
 github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
 github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
 github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U=
-github.com/narumiruna/go-iforest v0.2.2 h1:48GGRVLSlgtV3vGr+eedXODn5RT3WvYroqpMNEoQvkk=
-github.com/narumiruna/go-iforest v0.2.2/go.mod h1:2pumoiqKf0Lr+KvLECMC8uNrbRkxtSvUwMJC/6AW7DM=
 github.com/nxadm/tail v1.4.8 h1:nPr65rt6Y5JFSKQO7qToXr7pePgD6Gwiw05lkbyAQTE=
 github.com/nxadm/tail v1.4.8/go.mod h1:+ncqLTQzXmGhMZNUePPaPqPvBxHAIsmXswZKocGu+AU=
 github.com/oklog/ulid v1.3.1/go.mod h1:CirwcVhetQ6Lv90oh/F+FBtV6XMibvdAFo93nm5qn4U=

diff --git a/pkg/ensemble/iforest/forest.go b/pkg/ensemble/iforest/forest.go
@@ -0,0 +1,175 @@
+package iforest
+
+import (
+	"math"
+	"math/rand"
+	"sync"
+)
+
+const (
+	defaultNumTrees       = 100                    // NumTrees applied by SetDefaultValues when the field is zero
+	defaultSampleSize     = 256                    // SampleSize applied by SetDefaultValues when the field is zero
+	defaultScoreThreshold = 0.6                    // Threshold applied by SetDefaultValues when the field is zero
+	defaultDetectionType  = DetectionTypeThreshold // DetectionType applied by SetDefaultValues when the field is empty
+	offset                = 0.5                    // NOTE(review): not referenced in the visible code; confirm where it is used
+)
+
+type DetectionType string // selects how Predict derives its outlier cutoff
+
+const (
+	DetectionTypeThreshold  DetectionType = "threshold"  // Predict uses Options.Threshold as the cutoff
+	DetectionTypeProportion DetectionType = "proportion" // Predict uses Quantile(scores, 1-Options.Proportion) as the cutoff
+)
+
+type Options struct { // configuration embedded in IsolationForest; zero fields are filled by SetDefaultValues
+	DetectionType DetectionType `json:"detectionType"` // how Predict picks its cutoff; empty selects the default
+	Threshold     float64       `json:"threshold"`     // threshold mode: scores >= Threshold are predicted as 1
+	Proportion    float64       `json:"proportion"`    // proportion mode: cutoff is Quantile(scores, 1-Proportion); has no default
+	NumTrees      int           `json:"numTrees"`      // number of trees Fit builds
+	SampleSize    int           `json:"sampleSize"`    // rows requested from SampleRows per tree; also normalizes Score
+	MaxDepth      int           `json:"maxDepth"`      // depth at which BuildTree stops splitting; zero becomes ceil(log2(SampleSize))
+}
+
+// SetDefaultValues replaces zero-valued fields with the package defaults.
+// Proportion has no default and is left as provided. A zero MaxDepth becomes
+// ceil(log2(SampleSize)), computed after SampleSize itself has been defaulted.
+func (o *Options) SetDefaultValues() {
+	if o.SampleSize == 0 {
+		o.SampleSize = defaultSampleSize
+	}
+	if o.NumTrees == 0 {
+		o.NumTrees = defaultNumTrees
+	}
+	if o.Threshold == 0 {
+		o.Threshold = defaultScoreThreshold
+	}
+	if o.DetectionType == "" {
+		o.DetectionType = defaultDetectionType
+	}
+
+	// An explicit depth limit wins; otherwise derive it from the sample size.
+	if o.MaxDepth != 0 {
+		return
+	}
+	logSize := math.Log2(float64(o.SampleSize))
+	o.MaxDepth = int(math.Ceil(logSize))
+}
+
+// IsolationForest is an anomaly detection model using isolation trees.
+type IsolationForest struct {
+	*Options // embedded by pointer; New and NewWithOptions always set it
+
+	Trees []*TreeNode // one root per tree; allocated and filled by Fit
+}
+
+// New creates an IsolationForest with default options.
+func New() *IsolationForest {
+	options := &Options{}
+	options.SetDefaultValues()
+	return &IsolationForest{Options: options}
+}
+
+// NewWithOptions returns an IsolationForest configured from options.
+// Zero-valued fields are filled in by SetDefaultValues. options is received by
+// value, so the caller's copy is not modified.
+func NewWithOptions(options Options) *IsolationForest {
+	forest := &IsolationForest{Options: &options}
+	forest.Options.SetDefaultValues()
+	return forest
+}
+
+// Fit builds isolation trees from the training samples.
+func (f *IsolationForest) Fit(samples [][]float64) {
+	wg := sync.WaitGroup{}
+	wg.Add(f.NumTrees)
+
+	f.Trees = make([]*TreeNode, f.NumTrees)
+	for i := 0; i < f.NumTrees; i++ {
+		sampled := SampleRows(samples, f.SampleSize)
+		go func(index int) {
+			defer wg.Done()
+			tree := f.BuildTree(sampled, 0)
+			f.Trees[index] = tree
+		}(i)
+	}
+	wg.Wait()
+}
+
+// BuildTree recursively splits the data to isolate outliers.
+func (f *IsolationForest) BuildTree(samples [][]float64, depth int) *TreeNode {
+	numSamples := len(samples)
+	if numSamples == 0 {
+		return &TreeNode{}
+	}
+	numFeatures := len(samples[0])
+	if depth >= f.MaxDepth || numSamples <= 1 {
+		return &TreeNode{Size: numSamples}
+	}
+
+	splitIndex := rand.Intn(numFeatures)
+	column := Column(samples, splitIndex)
+	minValue, maxValue := MinMax(column)
+	splitValue := rand.Float64()*(maxValue-minValue) + minValue
+
+	leftSamples := make([][]float64, 0)
+	rightSamples := make([][]float64, 0)
+	for _, sample := range samples {
+		if sample[splitIndex] < splitValue {
+			leftSamples = append(leftSamples, sample)
+		} else {
+			rightSamples = append(rightSamples, sample)
+		}
+	}
+
+	return &TreeNode{
+		Left:       f.BuildTree(leftSamples, depth+1),
+		Right:      f.BuildTree(rightSamples, depth+1),
+		SplitIndex: splitIndex,
+		SplitValue: splitValue,
+	}
+}
+
+// Score returns one anomaly score per sample, in input order.
+//
+// For each sample the values returned by pathLength are summed over all trees,
+// divided by the number of trees and by averagePathLength(SampleSize), and
+// mapped through 2^(-x). Smaller summed path lengths therefore give larger
+// scores. If the forest has no trees (Fit was not called) the division is 0/0
+// and every score is NaN.
+func (f *IsolationForest) Score(samples [][]float64) []float64 {
+	scores := make([]float64, len(samples))
+	if len(samples) == 0 {
+		return scores
+	}
+
+	// Both divisors depend only on the forest, so compute them once instead of
+	// once per sample.
+	// NOTE(review): assumes averagePathLength is a pure function; confirm.
+	numTrees := float64(len(f.Trees))
+	normalizer := averagePathLength(float64(f.SampleSize))
+
+	for i, sample := range samples {
+		total := 0.0
+		for _, tree := range f.Trees {
+			total += pathLength(sample, tree, 0)
+		}
+		scores[i] = math.Pow(2.0, -total/numTrees/normalizer)
+	}
+	return scores
+}
+
+// Predict scores samples and returns one label per sample, in input order:
+// 1 when the score is at or above the cutoff (outlier), 0 otherwise.
+//
+// The cutoff depends on DetectionType: DetectionTypeThreshold uses Threshold
+// directly, DetectionTypeProportion uses Quantile(scores, 1-Proportion)
+// computed from the scores of the samples being predicted. Any other
+// DetectionType panics.
+func (f *IsolationForest) Predict(samples [][]float64) []int {
+	scores := f.Score(samples)
+
+	var threshold float64
+	switch f.DetectionType {
+	case DetectionTypeThreshold:
+		threshold = f.Threshold
+	case DetectionTypeProportion:
+		// Reuse the scores computed above rather than scoring every sample a
+		// second time. Quantile receives its own copy in case it reorders its
+		// argument: scores must stay aligned with samples for the loop below.
+		scratch := make([]float64, len(scores))
+		copy(scratch, scores)
+		threshold = Quantile(scratch, 1-f.Proportion)
+	default:
+		panic("Invalid detection type")
+	}
+
+	// make zero-fills the slice, so only outliers need an explicit write.
+	predictions := make([]int, len(samples))
+	for i, score := range scores {
+		if score >= threshold {
+			predictions[i] = 1
+		}
+	}
+	return predictions
+}
+
+// FeatureImportance adds up, element by element, the slices returned by each
+// tree's FeatureImportance for sample. The result has one entry per element
+// of sample.
+func (f *IsolationForest) FeatureImportance(sample []float64) []int {
+	totals := make([]int, len(sample))
+	for _, root := range f.Trees {
+		perTree := root.FeatureImportance(sample)
+		for idx := range perTree {
+			totals[idx] += perTree[idx]
+		}
+	}
+	return totals
+}
diff --git a/pkg/ensemble/iforest/forest_test.go b/pkg/ensemble/iforest/forest_test.go
@@ -0,0 +1,59 @@
+package iforest
+
+import (
+	"testing"
+)
+
+func TestIsolationForest(t *testing.T) {
+	tests := []struct {
+		features    [][]float64
+		predictions []int
+	}{
+		{
+			[][]float64{
+				{0, 0, 0},
+				{0, 0, 0},
+				{0, 0, 0},
+				{1, 1, 1},
+			},
+			[]int{0, 0, 0, 1},
+		},
+	}
+
+	for _, tt := range tests {
+		forest := New()
+		forest.Fit(tt.features)
+
+		preds := forest.Predict(tt.features)
+		for i, pred := range preds {
+			if pred != tt.predictions[i] {
+				t.Errorf("expected %v, got %v", tt.predictions[i], pred)
+			}
+		}
+	}
+}
+
+func TestIsolationForestOnRandomSamples(t *testing.T) {
+	dim := 2
+
+	samples := RandomMatrix(1000, dim)
+
+	forest := New()
+	forest.Fit(samples)
+
+	outliers := RandomMatrix(5, dim)
+	outliers = AddScalar(outliers, 10)
+	preds := forest.Predict(outliers)
+	for _, pred := range preds {
+		if pred != 1 {
+			t.Errorf("expected 1, got %v", pred)
+		}
+	}
+
+	preds = forest.Predict(SampleRows(samples, 5))
+	for _, pred := range preds {
+		if pred != 0 {
+			t.Errorf("expected 0, got %v", pred)
+		}
+	}
+}
diff --git a/pkg/ensemble/iforest/matrix.go b/pkg/ensemble/iforest/matrix.go
@@ -0,0 +1,69 @@
+package iforest
+
+import (
+	"math"
+	"math/rand"
+)
+
+// SampleRows returns size rows of matrix chosen at random without repetition.
+// When matrix has size rows or fewer, matrix itself is returned (not a copy).
+// Selected rows are not copied either: they share storage with matrix.
+// It panics if size is not positive.
+func SampleRows(matrix [][]float64, size int) [][]float64 {
+	if size <= 0 {
+		panic("size must be greater than 0")
+	}
+
+	total := len(matrix)
+	if total <= size {
+		return matrix
+	}
+
+	// The first size entries of a random permutation are distinct row indices.
+	picked := make([][]float64, 0, size)
+	for _, rowIndex := range rand.Perm(total)[:size] {
+		picked = append(picked, matrix[rowIndex])
+	}
+	return picked
+}
+
+// Column copies the value at columnIndex from every row of matrix into a new
+// slice, preserving row order. It panics if a row is too short to contain
+// columnIndex.
+func Column(matrix [][]float64, columnIndex int) []float64 {
+	values := make([]float64, 0, len(matrix))
+	for rowIndex := range matrix {
+		values = append(values, matrix[rowIndex][columnIndex])
+	}
+	return values
+}
+
+// MinMax scans floats once and returns its smallest and largest values, in
+// that order. For an empty slice it returns (+Inf, -Inf). NaN elements satisfy
+// neither comparison and are therefore skipped.
+func MinMax(floats []float64) (float64, float64) {
+	lowest := math.Inf(1)
+	highest := math.Inf(-1)
+	for i := range floats {
+		value := floats[i]
+		// Two independent checks: the first element has to update both bounds.
+		if value < lowest {
+			lowest = value
+		}
+		if value > highest {
+			highest = value
+		}
+	}
+	return lowest, highest
+}
+
+// RandomMatrix allocates a matrix with the given number of rows and columns
+// and fills it row by row with values from rand.Float64, i.e. in [0.0, 1.0).
+func RandomMatrix(rows, cols int) [][]float64 {
+	matrix := make([][]float64, rows)
+	for r := range matrix {
+		row := make([]float64, cols)
+		for c := range row {
+			row[c] = rand.Float64()
+		}
+		matrix[r] = row
+	}
+	return matrix
+}
+
+// AddScalar returns a new matrix in which scalar has been added to every
+// element of matrix. The input is left unmodified and each row keeps its length.
+func AddScalar(matrix [][]float64, scalar float64) [][]float64 {
+	shifted := make([][]float64, 0, len(matrix))
+	for _, row := range matrix {
+		shiftedRow := make([]float64, len(row))
+		for j, value := range row {
+			shiftedRow[j] = value + scalar
+		}
+		shifted = append(shifted, shiftedRow)
+	}
+	return shifted
+}