Skip to content

Commit

Permalink
add isolation forest algorithm
Browse files Browse the repository at this point in the history
  • Loading branch information
narumiruna committed Jan 3, 2025
1 parent 31069e5 commit 1493234
Show file tree
Hide file tree
Showing 11 changed files with 597 additions and 4 deletions.
1 change: 0 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,6 @@ require (
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/narumiruna/go-iforest v0.2.2 // indirect
github.com/pelletier/go-toml/v2 v2.2.3 // indirect
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
github.com/prometheus/client_model v0.6.1 // indirect
Expand Down
2 changes: 0 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -446,8 +446,6 @@ github.com/muesli/kmeans v0.3.0/go.mod h1:eNyybq0tX9/iBEP6EMU4Y7dpmGK0uEhODdZpnG
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U=
github.com/narumiruna/go-iforest v0.2.2 h1:48GGRVLSlgtV3vGr+eedXODn5RT3WvYroqpMNEoQvkk=
github.com/narumiruna/go-iforest v0.2.2/go.mod h1:2pumoiqKf0Lr+KvLECMC8uNrbRkxtSvUwMJC/6AW7DM=
github.com/nxadm/tail v1.4.8 h1:nPr65rt6Y5JFSKQO7qToXr7pePgD6Gwiw05lkbyAQTE=
github.com/nxadm/tail v1.4.8/go.mod h1:+ncqLTQzXmGhMZNUePPaPqPvBxHAIsmXswZKocGu+AU=
github.com/oklog/ulid v1.3.1/go.mod h1:CirwcVhetQ6Lv90oh/F+FBtV6XMibvdAFo93nm5qn4U=
Expand Down
175 changes: 175 additions & 0 deletions pkg/ensemble/iforest/forest.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
package iforest

import (
"math"
"math/rand"
"sync"
)

// Fallback values that Options.SetDefaultValues assigns to zero-valued
// fields, plus a shared numeric constant.
const (
// defaultNumTrees is the number of trees used when Options.NumTrees is 0.
defaultNumTrees = 100
// defaultSampleSize is the per-tree sample size used when Options.SampleSize is 0.
defaultSampleSize = 256
// defaultScoreThreshold is the score cutoff used when Options.Threshold is 0.
defaultScoreThreshold = 0.6
// defaultDetectionType is the detection mode used when Options.DetectionType is empty.
defaultDetectionType = DetectionTypeThreshold
// offset is not referenced in the visible part of this file.
// NOTE(review): presumably consumed by a numeric helper defined elsewhere in the package — confirm.
offset = 0.5
)

// DetectionType selects how Predict derives the score cutoff that separates
// outliers from normal samples.
type DetectionType string

const (
// DetectionTypeThreshold uses Options.Threshold directly as the score cutoff.
DetectionTypeThreshold DetectionType = "threshold"
// DetectionTypeProportion derives the cutoff by calling Quantile on the
// scores of the samples being predicted with q = 1 - Options.Proportion.
DetectionTypeProportion DetectionType = "proportion"
)

// Options configures an IsolationForest. Every field except Proportion is
// replaced by a default in SetDefaultValues when it holds its zero value.
type Options struct {
// DetectionType selects how Predict chooses its score cutoff.
DetectionType DetectionType `json:"detectionType"`
// Threshold is the cutoff used with DetectionTypeThreshold; samples whose
// score is >= Threshold are predicted as outliers.
Threshold float64 `json:"threshold"`
// Proportion is used with DetectionTypeProportion; Predict passes
// 1 - Proportion to Quantile to obtain the cutoff. It has no default.
Proportion float64 `json:"proportion"`
// NumTrees is the number of trees Fit builds.
NumTrees int `json:"numTrees"`
// SampleSize is the number of rows drawn for each tree and the size used
// to normalize path lengths in Score.
SampleSize int `json:"sampleSize"`
// MaxDepth is the depth at which BuildTree stops splitting.
MaxDepth int `json:"maxDepth"`
}

// SetDefaultValues fills every zero-valued field (other than Proportion) with
// its package default. Fields that already hold a non-zero value are left
// untouched.
//
// MaxDepth is derived last because its default, ceil(log2(SampleSize)),
// depends on the (possibly just defaulted) SampleSize.
func (o *Options) SetDefaultValues() {
	if len(o.DetectionType) == 0 {
		o.DetectionType = defaultDetectionType
	}
	if o.Threshold == 0 {
		o.Threshold = defaultScoreThreshold
	}
	if o.NumTrees == 0 {
		o.NumTrees = defaultNumTrees
	}
	if o.SampleSize == 0 {
		o.SampleSize = defaultSampleSize
	}

	if o.MaxDepth != 0 {
		return
	}
	levels := math.Log2(float64(o.SampleSize))
	o.MaxDepth = int(math.Ceil(levels))
}

// IsolationForest is an anomaly detection model using isolation trees.
// It embeds *Options, so configuration fields such as NumTrees, SampleSize
// and MaxDepth are read directly from the forest value.
type IsolationForest struct {
*Options

// Trees holds the root node of every tree; Fit replaces it with a fresh
// slice of NumTrees roots.
Trees []*TreeNode
}

// New returns an unfitted IsolationForest whose options are all package
// defaults. It is equivalent to NewWithOptions(Options{}).
func New() *IsolationForest {
	return NewWithOptions(Options{})
}

// NewWithOptions returns an unfitted IsolationForest configured by options.
// Zero-valued fields are filled in by Options.SetDefaultValues. Because
// options is received by value, the caller's copy is not modified.
func NewWithOptions(options Options) *IsolationForest {
	forest := &IsolationForest{Options: &options}
	forest.Options.SetDefaultValues()
	return forest
}

// Fit trains the forest on samples, replacing any previously built trees.
//
// For each of the NumTrees trees it draws up to SampleSize rows with
// SampleRows on the calling goroutine, then builds the tree in its own
// goroutine. Fit returns only after every tree has been stored in Trees.
func (f *IsolationForest) Fit(samples [][]float64) {
	var pending sync.WaitGroup
	pending.Add(f.NumTrees)

	f.Trees = make([]*TreeNode, f.NumTrees)
	for slot := range f.Trees {
		subset := SampleRows(samples, f.SampleSize)
		// Each goroutine writes to its own slot, so no locking is needed.
		go func(slot int, rows [][]float64) {
			defer pending.Done()
			root := f.BuildTree(rows, 0)
			f.Trees[slot] = root
		}(slot, subset)
	}

	pending.Wait()
}

// BuildTree recursively partitions samples into an isolation tree and returns
// its root.
//
// Recursion stops with a leaf when samples is empty (zero-valued node), when
// at most one sample remains, or when depth has reached MaxDepth; in the
// latter two cases the leaf records the number of samples in Size. Otherwise
// a feature is picked uniformly at random, a split value is drawn uniformly
// between that feature's minimum and maximum, and rows whose feature value is
// strictly below the split go left while all others go right.
func (f *IsolationForest) BuildTree(samples [][]float64, depth int) *TreeNode {
	count := len(samples)
	switch {
	case count == 0:
		return &TreeNode{}
	case count <= 1 || depth >= f.MaxDepth:
		return &TreeNode{Size: count}
	}

	feature := rand.Intn(len(samples[0]))
	lo, hi := MinMax(Column(samples, feature))
	cut := rand.Float64()*(hi-lo) + lo

	var below, rest [][]float64
	for _, row := range samples {
		if row[feature] < cut {
			below = append(below, row)
			continue
		}
		rest = append(rest, row)
	}

	node := &TreeNode{SplitIndex: feature, SplitValue: cut}
	// Left is built before Right, matching the order in which random numbers
	// are consumed by the two subtrees.
	node.Left = f.BuildTree(below, depth+1)
	node.Right = f.BuildTree(rest, depth+1)
	return node
}

// Score returns one anomaly score per row of samples, in input order.
//
// For each sample the path lengths reported by pathLength over every tree are
// summed, averaged over the number of trees, divided by
// averagePathLength(SampleSize) and mapped through 2^(-x), so shorter average
// paths give higher scores. If the forest has no trees the average is 0/0 and
// every score is NaN.
func (f *IsolationForest) Score(samples [][]float64) []float64 {
	scores := make([]float64, len(samples))
	if len(samples) == 0 {
		return scores
	}

	// Both factors are the same for every sample, so compute them once
	// instead of once per row.
	// NOTE(review): assumes averagePathLength is a pure function of its
	// argument — confirm against its definition.
	numTrees := float64(len(f.Trees))
	normalizer := averagePathLength(float64(f.SampleSize))

	for i, sample := range samples {
		total := 0.0
		for _, tree := range f.Trees {
			total += pathLength(sample, tree, 0)
		}
		scores[i] = math.Pow(2.0, -total/numTrees/normalizer)
	}
	return scores
}

// Predict classifies each row of samples as 1 (outlier) or 0 (normal) and
// returns the labels in input order.
//
// A sample is an outlier when its score is >= the cutoff. The cutoff depends
// on DetectionType:
//   - DetectionTypeThreshold: the configured Threshold.
//   - DetectionTypeProportion: Quantile(scores, 1-Proportion), computed over
//     the scores of the samples passed to this call.
//
// Predict panics if DetectionType is neither of these values.
func (f *IsolationForest) Predict(samples [][]float64) []int {
	scores := f.Score(samples)

	var threshold float64
	switch f.DetectionType {
	case DetectionTypeThreshold:
		threshold = f.Threshold
	case DetectionTypeProportion:
		// Reuse the scores already computed instead of scoring every sample
		// a second time. Quantile gets a private copy so that, should it
		// reorder its input, the score-to-sample mapping stays intact.
		scratch := make([]float64, len(scores))
		copy(scratch, scores)
		threshold = Quantile(scratch, 1-f.Proportion)
	default:
		panic("Invalid detection type")
	}

	// Entries start at 0 (normal); only outliers need to be marked.
	predictions := make([]int, len(scores))
	for i, score := range scores {
		if score >= threshold {
			predictions[i] = 1
		}
	}
	return predictions
}

// FeatureImportance returns, for the given sample, the element-wise sum of the
// slices produced by every tree's FeatureImportance. The result has one entry
// per element of sample.
// NOTE(review): each per-tree slice is presumably indexed by feature and no
// longer than sample — confirm against TreeNode.FeatureImportance.
func (f *IsolationForest) FeatureImportance(sample []float64) []int {
	totals := make([]int, len(sample))
	for t := range f.Trees {
		perTree := f.Trees[t].FeatureImportance(sample)
		for feature := range perTree {
			totals[feature] += perTree[feature]
		}
	}
	return totals
}
59 changes: 59 additions & 0 deletions pkg/ensemble/iforest/forest_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
package iforest

import (
"testing"
)

// TestIsolationForest fits a default forest on a tiny data set containing one
// obvious outlier and checks that predicting on the same rows flags only it.
func TestIsolationForest(t *testing.T) {
	tests := []struct {
		features    [][]float64
		predictions []int
	}{
		{
			features: [][]float64{
				{0, 0, 0},
				{0, 0, 0},
				{0, 0, 0},
				{1, 1, 1},
			},
			predictions: []int{0, 0, 0, 1},
		},
	}

	for _, tt := range tests {
		forest := New()
		forest.Fit(tt.features)

		preds := forest.Predict(tt.features)
		// Without this check a short result would pass silently and a long
		// one would panic on the index below.
		if len(preds) != len(tt.predictions) {
			t.Fatalf("expected %d predictions, got %d", len(tt.predictions), len(preds))
		}
		for i, pred := range preds {
			if pred != tt.predictions[i] {
				t.Errorf("sample %d: expected %v, got %v", i, tt.predictions[i], pred)
			}
		}
	}
}

// TestIsolationForestOnRandomSamples fits a default forest on 1000 random
// points in [0, 1)^2, then checks that points shifted by +10 are predicted as
// outliers and that rows drawn from the training data are predicted as normal.
func TestIsolationForestOnRandomSamples(t *testing.T) {
	const dim = 2

	inliers := RandomMatrix(1000, dim)

	forest := New()
	forest.Fit(inliers)

	// Shifting by 10 moves every coordinate well outside the training range.
	shifted := AddScalar(RandomMatrix(5, dim), 10)
	for _, got := range forest.Predict(shifted) {
		if got != 1 {
			t.Errorf("expected 1, got %v", got)
		}
	}

	for _, got := range forest.Predict(SampleRows(inliers, 5)) {
		if got != 0 {
			t.Errorf("expected 0, got %v", got)
		}
	}
}
69 changes: 69 additions & 0 deletions pkg/ensemble/iforest/matrix.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
package iforest

import (
"math"
"math/rand"
)

// SampleRows draws size distinct rows from matrix uniformly at random,
// without replacement.
//
// When matrix has size rows or fewer, matrix itself is returned unchanged.
// The returned rows are the same slices as in matrix, not copies.
// SampleRows panics if size is not positive.
func SampleRows(matrix [][]float64, size int) [][]float64 {
	switch {
	case size <= 0:
		panic("size must be greater than 0")
	case len(matrix) <= size:
		return matrix
	}

	// The first size entries of a random permutation are distinct row indices.
	chosen := rand.Perm(len(matrix))[:size]
	sampled := make([][]float64, 0, size)
	for _, rowIndex := range chosen {
		sampled = append(sampled, matrix[rowIndex])
	}
	return sampled
}

// Column extracts the values at columnIndex from every row of matrix and
// returns them as a new slice, in row order. It panics if any row has no
// element at columnIndex.
func Column(matrix [][]float64, columnIndex int) []float64 {
	values := make([]float64, 0, len(matrix))
	for r := range matrix {
		values = append(values, matrix[r][columnIndex])
	}
	return values
}

// MinMax scans floats once and returns its smallest and largest values, in
// that order. For an empty slice it returns (+Inf, -Inf). NaN entries never
// compare as smaller or larger and are therefore skipped.
func MinMax(floats []float64) (float64, float64) {
	lowest := math.Inf(1)
	highest := math.Inf(-1)
	for i := range floats {
		value := floats[i]
		if value < lowest {
			lowest = value
		}
		if value > highest {
			highest = value
		}
	}
	return lowest, highest
}

// RandomMatrix returns a rows x cols matrix whose entries are drawn
// independently from rand.Float64, i.e. uniformly from [0, 1). Entries are
// generated in row-major order.
func RandomMatrix(rows, cols int) [][]float64 {
	matrix := make([][]float64, rows)
	for r := range matrix {
		row := make([]float64, cols)
		for c := range row {
			row[c] = rand.Float64()
		}
		matrix[r] = row
	}
	return matrix
}

// AddScalar returns a new matrix in which scalar has been added to every
// element of matrix. The input is not modified, and rows may have different
// lengths.
func AddScalar(matrix [][]float64, scalar float64) [][]float64 {
	result := make([][]float64, len(matrix))
	for r, row := range matrix {
		shifted := make([]float64, len(row))
		for c, value := range row {
			shifted[c] = value + scalar
		}
		result[r] = shifted
	}
	return result
}
Loading

0 comments on commit 1493234

Please sign in to comment.