add isolation forest algorithm

c9s · Jan 3, 2025 · 7254820 · 7254820
1 parent 31069e5
commit 7254820
Show file tree

Hide file tree

Showing 7 changed files with 398 additions and 0 deletions.
diff --git a/pkg/ensemble/iforest/forest.go b/pkg/ensemble/iforest/forest.go
@@ -0,0 +1,166 @@
+package iforest
+
+import (
+	"math"
+	"math/rand"
+	"sync"
+)
+
+// Fallback values that Options.SetDefaultValues assigns to option fields that
+// were left at their zero value.
+const (
+	// defaultNumTrees is the NumTrees fallback.
+	defaultNumTrees       = 100
+	// defaultSampleSize is the SampleSize fallback.
+	defaultSampleSize     = 256
+	// defaultScoreThreshold is the Threshold fallback.
+	defaultScoreThreshold = 0.6
+	// defaultDetectionType is the DetectionType fallback.
+	defaultDetectionType  = DetectionTypeThreshold
+	// NOTE(review): offset is not referenced in the visible part of this
+	// change; presumably it is consumed elsewhere in the package — confirm.
+	offset                = 0.5
+)
+
+// DetectionType names the strategy Predict uses to pick its score cut-off.
+type DetectionType string
+
+const (
+	// DetectionTypeThreshold makes Predict compare every score against
+	// Options.Threshold.
+	DetectionTypeThreshold  DetectionType = "threshold"
+	// DetectionTypeProportion makes Predict use the (1 - Options.Proportion)
+	// quantile of the computed scores as the cut-off.
+	DetectionTypeProportion DetectionType = "proportion"
+)
+
+// Options configures an IsolationForest. Fields left at their zero value are
+// filled in by SetDefaultValues, except Proportion, which has no default.
+type Options struct {
+	// DetectionType selects how Predict derives its cut-off score.
+	DetectionType DetectionType `json:"detectionType"`
+	// Threshold is the cut-off used when DetectionType is DetectionTypeThreshold.
+	Threshold     float64       `json:"threshold"`
+	// Proportion is used when DetectionType is DetectionTypeProportion: the
+	// cut-off becomes the (1 - Proportion) quantile of the scores.
+	Proportion    float64       `json:"proportion"`
+	// NumTrees is the number of trees Fit builds.
+	NumTrees      int           `json:"numTrees"`
+	// SampleSize is the maximum number of rows handed to each tree by Fit.
+	SampleSize    int           `json:"sampleSize"`
+	// MaxDepth is the depth at which BuildTree stops splitting.
+	MaxDepth      int           `json:"maxDepth"`
+}
+
+// SetDefaultValues fills every option that is still at its zero value.
+//
+// DetectionType, Threshold, NumTrees and SampleSize fall back to the package
+// defaults. MaxDepth is derived last, as ceil(log2(SampleSize)), so that it
+// reflects the sample size in effect after defaulting. Proportion is never
+// modified.
+func (o *Options) SetDefaultValues() {
+	if len(o.DetectionType) == 0 {
+		o.DetectionType = defaultDetectionType
+	}
+	if o.Threshold == 0.0 {
+		o.Threshold = defaultScoreThreshold
+	}
+	if o.NumTrees == 0 {
+		o.NumTrees = defaultNumTrees
+	}
+	if o.SampleSize == 0 {
+		o.SampleSize = defaultSampleSize
+	}
+
+	// An explicitly configured depth limit is kept as is.
+	if o.MaxDepth != 0 {
+		return
+	}
+	levels := math.Log2(float64(o.SampleSize))
+	o.MaxDepth = int(math.Ceil(levels))
+}
+
+// IsolationForest is an ensemble of isolation trees together with the options
+// it was configured with.
+type IsolationForest struct {
+	// Options is embedded so its fields (NumTrees, SampleSize, ...) are
+	// accessible directly on the forest.
+	*Options
+
+	// Trees holds one root node per tree; Fit replaces it on every call.
+	Trees []*TreeNode
+}
+
+// New returns an unfitted IsolationForest whose options are all set to the
+// package defaults.
+func New() *IsolationForest {
+	forest := &IsolationForest{Options: new(Options)}
+	forest.Options.SetDefaultValues()
+	return forest
+}
+
+// NewWithOptions returns an unfitted IsolationForest configured with options.
+// The options are received by value, so the defaults are applied to a private
+// copy and the caller's struct is left untouched.
+func NewWithOptions(options Options) *IsolationForest {
+	configured := &options
+	configured.SetDefaultValues()
+
+	forest := new(IsolationForest)
+	forest.Options = configured
+	return forest
+}
+
+func (f *IsolationForest) Fit(samples [][]float64) {
+	wg := sync.WaitGroup{}
+	wg.Add(f.NumTrees)
+
+	f.Trees = make([]*TreeNode, f.NumTrees)
+	for i := 0; i < f.NumTrees; i++ {
+		sampled := Sample(samples, f.SampleSize)
+		go func(index int) {
+			defer wg.Done()
+			tree := f.BuildTree(sampled, 0)
+			f.Trees[index] = tree
+		}(i)
+	}
+	wg.Wait()
+}
+
+func (f *IsolationForest) BuildTree(samples [][]float64, depth int) *TreeNode {
+	numSamples := len(samples)
+	if numSamples == 0 {
+		return &TreeNode{}
+	}
+	numFeatures := len(samples[0])
+	if depth >= f.MaxDepth || numSamples <= 1 {
+		return &TreeNode{Size: numSamples}
+	}
+
+	splitIndex := rand.Intn(numFeatures)
+	column := Column(samples, splitIndex)
+	minValue, maxValue := MinMax(column)
+	splitValue := rand.Float64()*(maxValue-minValue) + minValue
+
+	leftSamples := make([][]float64, 0)
+	rightSamples := make([][]float64, 0)
+	for _, vector := range samples {
+		if vector[splitIndex] < splitValue {
+			leftSamples = append(leftSamples, vector)
+		} else {
+			rightSamples = append(rightSamples, vector)
+		}
+	}
+
+	return &TreeNode{
+		Left:       f.BuildTree(leftSamples, depth+1),
+		Right:      f.BuildTree(rightSamples, depth+1),
+		SplitIndex: splitIndex,
+		SplitValue: splitValue,
+	}
+}
+
+// Score returns one anomaly score per sample, in input order.
+//
+// For each sample the path lengths through all trees are averaged, divided by
+// averagePathLength(SampleSize) and mapped through 2^(-x), so shorter average
+// paths produce higher scores.
+//
+// If the forest holds no trees (Fit has not been called), the average is 0/0
+// and every returned score is NaN.
+func (f *IsolationForest) Score(samples [][]float64) []float64 {
+	scores := make([]float64, len(samples))
+
+	// Both values are the same for every sample, so compute them once.
+	numTrees := float64(len(f.Trees))
+	normalizer := averagePathLength(float64(f.SampleSize))
+
+	for i, sample := range samples {
+		total := 0.0
+		for _, tree := range f.Trees {
+			total += pathLength(sample, tree, 0)
+		}
+		scores[i] = math.Pow(2.0, -total/numTrees/normalizer)
+	}
+	return scores
+}
+
+// Predict labels every sample: 1 when its score is at or above the cut-off,
+// 0 otherwise. The result has one entry per sample, in input order.
+//
+// The cut-off depends on DetectionType:
+//   - DetectionTypeThreshold uses Threshold as is.
+//   - DetectionTypeProportion uses the (1 - Proportion) quantile of the scores
+//     of the given samples.
+//
+// Predict panics if DetectionType is neither of the two.
+func (f *IsolationForest) Predict(samples [][]float64) []int {
+	// Score once and reuse the result for both the cut-off and the labels.
+	scores := f.Score(samples)
+	predictions := make([]int, len(scores))
+
+	var threshold float64
+	switch f.DetectionType {
+	case DetectionTypeThreshold:
+		threshold = f.Threshold
+	case DetectionTypeProportion:
+		if len(scores) == 0 {
+			// Quantile panics on empty input; with no samples there is
+			// nothing to label, matching the threshold mode.
+			return predictions
+		}
+		threshold = Quantile(scores, 1-f.Proportion)
+	default:
+		panic("Invalid detection type")
+	}
+
+	// predictions is zero-initialised, so only the positive label is written.
+	for i, score := range scores {
+		if score >= threshold {
+			predictions[i] = 1
+		}
+	}
+
+	return predictions
+}
+
+// FeatureImportance sums, feature by feature, the values that each tree's
+// FeatureImportance reports for sample. The result has one entry per feature
+// of sample.
+func (f *IsolationForest) FeatureImportance(sample []float64) []int {
+	totals := make([]int, len(sample))
+	for t := range f.Trees {
+		for feature, count := range f.Trees[t].FeatureImportance(sample) {
+			totals[feature] += count
+		}
+	}
+	return totals
+}
diff --git a/pkg/ensemble/iforest/forest_test.go b/pkg/ensemble/iforest/forest_test.go
@@ -0,0 +1,34 @@
+package iforest
+
+import (
+	"testing"
+)
+
+// TestIsolationForest fits a default forest on a tiny data set containing a
+// single obvious outlier and checks the label predicted for every row.
+func TestIsolationForest(t *testing.T) {
+	cases := []struct {
+		features    [][]float64
+		predictions []int
+	}{
+		{
+			[][]float64{
+				{0, 0, 0},
+				{0, 0, 0},
+				{0, 0, 0},
+				{1, 1, 1},
+			},
+			[]int{0, 0, 0, 1},
+		},
+	}
+
+	for _, c := range cases {
+		forest := New()
+		forest.Fit(c.features)
+
+		preds := forest.Predict(c.features)
+		// Without this check a short result would pass silently and a long
+		// one would panic on the index below.
+		if len(preds) != len(c.predictions) {
+			t.Fatalf("expected %d predictions, got %d", len(c.predictions), len(preds))
+		}
+		for i, pred := range preds {
+			if pred != c.predictions[i] {
+				t.Errorf("sample %d: expected %v, got %v", i, c.predictions[i], pred)
+			}
+		}
+	}
+}
diff --git a/pkg/ensemble/iforest/math.go b/pkg/ensemble/iforest/math.go
@@ -0,0 +1,66 @@
+package iforest
+
+import (
+	"math"
+	"math/rand"
+)
+
+// Sample picks size rows of matrix at random, without repetition.
+//
+// When matrix has size rows or fewer, matrix itself is returned. Otherwise a
+// new outer slice is returned whose rows are shared with matrix, not copied.
+// Sample panics if size is not positive.
+func Sample(matrix [][]float64, size int) [][]float64 {
+	if size <= 0 {
+		panic("size must be greater than 0")
+	}
+
+	total := len(matrix)
+	if total <= size {
+		return matrix
+	}
+
+	// The first size entries of a random permutation are distinct row indices.
+	picked := make([][]float64, 0, size)
+	for _, rowIndex := range rand.Perm(total)[:size] {
+		picked = append(picked, matrix[rowIndex])
+	}
+	return picked
+}
+
+// Column returns a new slice holding the element at columnIndex of every row
+// of matrix, in row order.
+func Column(matrix [][]float64, columnIndex int) []float64 {
+	values := make([]float64, 0, len(matrix))
+	for r := range matrix {
+		values = append(values, matrix[r][columnIndex])
+	}
+	return values
+}
+
+// MinMax returns the smallest and the largest value in floats, in that order.
+// For an empty input it returns (+Inf, -Inf).
+func MinMax(floats []float64) (float64, float64) {
+	lowest := math.Inf(1)
+	highest := math.Inf(-1)
+	for i := range floats {
+		current := floats[i]
+		if current < lowest {
+			lowest = current
+		}
+		if current > highest {
+			highest = current
+		}
+	}
+	return lowest, highest
+}
+
+// RandomMatrix returns a rows x cols matrix in which every element is drawn
+// from rand.Float64, i.e. from [0, 1). Elements are filled row by row.
+func RandomMatrix(rows, cols int) [][]float64 {
+	result := make([][]float64, rows)
+	for r := range result {
+		line := make([]float64, cols)
+		for c := range line {
+			line[c] = rand.Float64()
+		}
+		result[r] = line
+	}
+	return result
+}
+
+// AddScalar returns a new matrix in which scalar has been added to every
+// element of matrix. The input matrix is not modified.
+func AddScalar(matrix [][]float64, scalar float64) [][]float64 {
+	shifted := make([][]float64, 0, len(matrix))
+	for _, source := range matrix {
+		target := make([]float64, len(source))
+		for c, value := range source {
+			target[c] = value + scalar
+		}
+		shifted = append(shifted, target)
+	}
+	return shifted
+}
diff --git a/pkg/ensemble/iforest/path.go b/pkg/ensemble/iforest/path.go
@@ -0,0 +1,33 @@
+package iforest
+
+import "math"
+
+// eulerGamma is the additive constant used by harmonicNumber.
+const eulerGamma = 0.5772156649
+
+// harmonicNumber returns ln(x) + eulerGamma.
+func harmonicNumber(x float64) float64 {
+	logarithm := math.Log(x)
+	return eulerGamma + logarithm
+}
+
+// averagePathLength returns the path-length term used for x samples:
+//
+//   - x > 2:   2*harmonicNumber(x-1) - 2*(x-1)/x
+//   - x == 2:  1
+//   - otherwise (including NaN): 0
+//
+// pathLength adds it to the depth reached at a leaf of that size.
+func averagePathLength(x float64) float64 {
+	switch {
+	case x > 2:
+		return 2.0*harmonicNumber(x-1) - 2.0*(x-1)/x
+	case x == 2:
+		return 1.0
+	default:
+		return 0.0
+	}
+}
+
+// pathLength walks vector down the tree rooted at node and returns the number
+// of edges traversed, counted from currentPathLength, plus
+// averagePathLength of the size recorded at the leaf that is reached.
+//
+// At every inner node the walk goes left when the vector's value at the
+// node's split index is strictly below the split value, and right otherwise.
+func pathLength(vector []float64, node *TreeNode, currentPathLength int) float64 {
+	depth := currentPathLength
+	for current := node; ; depth++ {
+		if current.IsLeaf() {
+			return float64(depth) + averagePathLength(float64(current.Size))
+		}
+
+		if vector[current.SplitIndex] < current.SplitValue {
+			current = current.Left
+		} else {
+			current = current.Right
+		}
+	}
+}
diff --git a/pkg/ensemble/iforest/quantile.go b/pkg/ensemble/iforest/quantile.go
@@ -0,0 +1,34 @@
+package iforest
+
+import (
+	"fmt"
+	"math"
+	"sort"
+
+	"github.com/c9s/bbgo/pkg/datatype/floats"
+)
+
+// Quantile returns the q-quantile of numbers, using linear interpolation
+// between the two closest ranks of the sorted values. The input is copied
+// before sorting and is not modified.
+//
+// Quantile panics if numbers is empty or if q lies outside [0, 1].
+func Quantile(numbers floats.Slice, q float64) float64 {
+	count := len(numbers)
+	if count == 0 {
+		panic("numbers must not be empty")
+	}
+	if q < 0 || q > 1 {
+		panic(fmt.Sprintf("q must be in [0, 1], got %v", q))
+	}
+
+	ordered := append(make(floats.Slice, 0, count), numbers...)
+	sort.Float64s(ordered)
+
+	// Fractional rank of the quantile within the sorted values.
+	rank := q * (float64(count) - 1)
+	below := int(math.Floor(rank))
+	above := int(math.Ceil(rank))
+	if below == above {
+		// The rank is a whole number, so no interpolation is needed.
+		return ordered[below]
+	}
+
+	weight := rank - float64(below)
+	return ordered[below] + weight*(ordered[above]-ordered[below])
+}
diff --git a/pkg/ensemble/iforest/quantile_test.go b/pkg/ensemble/iforest/quantile_test.go
@@ -0,0 +1,27 @@
+package iforest
+
+import (
+	"testing"
+
+	"github.com/c9s/bbgo/pkg/datatype/floats"
+)
+
+func TestQuantile(t *testing.T) {
+	cases := []struct {
+		numbers  floats.Slice
+		q        float64
+		expected float64
+	}{
+		{floats.Slice{1, 2}, 0.5, 1.5},
+		{floats.Slice{1, 2, 3, 4, 5}, 0.5, 3},
+		{floats.Slice{1, 2, 3, 4, 5}, 1.0, 5},
+		{floats.Slice{1, 2, 3, 4, 5}, 0.0, 1},
+	}
+
+	for _, c := range cases {
+		actual := Quantile(c.numbers, c.q)
+		if actual != c.expected {
+			t.Errorf("Quantile(%v, %v) == %v, expected %v", c.numbers, c.q, actual, c.expected)
+		}
+	}
+}