Skip to content

Commit

Permalink
add isolation forest algorithm
Browse files Browse the repository at this point in the history
  • Loading branch information
narumiruna committed Jan 3, 2025
1 parent 31069e5 commit 7254820
Show file tree
Hide file tree
Showing 7 changed files with 398 additions and 0 deletions.
166 changes: 166 additions & 0 deletions pkg/ensemble/iforest/forest.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
package iforest

import (
"math"
"math/rand"
"sync"
)

// Default settings applied by Options.SetDefaultValues when the corresponding
// option is left at its zero value.
const (
// defaultNumTrees is the number of trees Fit builds when NumTrees is unset.
defaultNumTrees = 100
// defaultSampleSize is the per-tree subsample size when SampleSize is unset.
defaultSampleSize = 256
// defaultScoreThreshold is the anomaly-score cutoff when Threshold is unset.
defaultScoreThreshold = 0.6
// defaultDetectionType is the detection mode when DetectionType is empty.
defaultDetectionType = DetectionTypeThreshold
// offset is not referenced anywhere in this file.
// NOTE(review): confirm where it is used before changing or removing it.
offset = 0.5
)

// DetectionType selects how Predict derives the score cutoff that separates
// anomalies (label 1) from normal samples (label 0).
type DetectionType string

const (
// DetectionTypeThreshold uses Options.Threshold directly as the cutoff.
DetectionTypeThreshold DetectionType = "threshold"
// DetectionTypeProportion uses the (1 - Options.Proportion) quantile of the
// scores of the samples being predicted as the cutoff.
DetectionTypeProportion DetectionType = "proportion"
)

// Options configures an IsolationForest. Fields left at their zero value are
// filled in by SetDefaultValues, except Proportion, which has no default.
type Options struct {
// DetectionType selects how Predict chooses its score cutoff.
DetectionType DetectionType `json:"detectionType"`
// Threshold is the score cutoff used when DetectionType is
// DetectionTypeThreshold; scores >= Threshold are labelled 1.
Threshold float64 `json:"threshold"`
// Proportion is used when DetectionType is DetectionTypeProportion: the
// cutoff is the (1 - Proportion) quantile of the predicted scores.
Proportion float64 `json:"proportion"`
// NumTrees is the number of trees Fit builds.
NumTrees int `json:"numTrees"`
// SampleSize is the number of rows drawn for each tree and the size used
// to normalize path lengths in Score.
SampleSize int `json:"sampleSize"`
// MaxDepth is the depth at which BuildTree stops splitting.
MaxDepth int `json:"maxDepth"`
}

// SetDefaultValues fills every unset option with its default.
//
// An empty DetectionType and a zero Threshold are replaced by the package
// defaults. NumTrees, SampleSize and MaxDepth are treated as unset when they
// are zero or negative: a negative count can never be valid and would
// otherwise panic later in Fit (negative WaitGroup counter / slice length) or
// in Sample. MaxDepth defaults to ceil(log2(SampleSize)), so SampleSize is
// resolved first. Proportion has no default and is left untouched.
func (o *Options) SetDefaultValues() {
	if o.DetectionType == "" {
		o.DetectionType = defaultDetectionType
	}

	if o.Threshold == 0 {
		o.Threshold = defaultScoreThreshold
	}

	if o.NumTrees <= 0 {
		o.NumTrees = defaultNumTrees
	}

	if o.SampleSize <= 0 {
		o.SampleSize = defaultSampleSize
	}

	// Must run after SampleSize has been resolved.
	if o.MaxDepth <= 0 {
		o.MaxDepth = int(math.Ceil(math.Log2(float64(o.SampleSize))))
	}
}

// IsolationForest is an ensemble of isolation trees. It embeds *Options, so
// the configuration fields (NumTrees, SampleSize, ...) are accessible directly
// on the forest.
type IsolationForest struct {
*Options

// Trees holds the root node of every tree; it is (re)populated by Fit.
Trees []*TreeNode
}

// New returns an IsolationForest configured entirely with default options.
// The forest has no trees until Fit is called.
func New() *IsolationForest {
	return NewWithOptions(Options{})
}

// NewWithOptions returns an IsolationForest using the given options, with any
// unset field replaced by its default. The options are received by value, so
// the caller's struct is never modified.
func NewWithOptions(options Options) *IsolationForest {
	opts := &options
	opts.SetDefaultValues()

	forest := &IsolationForest{Options: opts}
	return forest
}

// Fit builds NumTrees trees from samples, replacing any previously fitted
// trees. Each tree gets its own subsample of at most SampleSize rows, drawn
// sequentially on the calling goroutine, and is then built in a separate
// goroutine. Fit returns once every tree has been built.
func (f *IsolationForest) Fit(samples [][]float64) {
	var pending sync.WaitGroup
	pending.Add(f.NumTrees)

	f.Trees = make([]*TreeNode, f.NumTrees)
	for slot := range f.Trees {
		subset := Sample(samples, f.SampleSize)
		// Each goroutine writes to its own slot, so no locking is needed.
		go func(slot int, subset [][]float64) {
			defer pending.Done()
			f.Trees[slot] = f.BuildTree(subset, 0)
		}(slot, subset)
	}
	pending.Wait()
}

// BuildTree recursively builds an isolation tree over samples, where depth is
// the depth of the node being created (0 for the root).
//
// An empty sample set yields a zero-valued node. Recursion stops with a leaf
// that records the number of samples it holds when depth reaches MaxDepth,
// when at most one sample remains, or when the samples have no features to
// split on (previously this case panicked inside rand.Intn). Otherwise a
// feature is picked uniformly at random, a split value is drawn uniformly
// between that feature's minimum and maximum, and samples strictly below the
// split go to the left child while the rest go to the right child.
func (f *IsolationForest) BuildTree(samples [][]float64, depth int) *TreeNode {
	numSamples := len(samples)
	if numSamples == 0 {
		return &TreeNode{}
	}

	numFeatures := len(samples[0])
	if depth >= f.MaxDepth || numSamples <= 1 || numFeatures == 0 {
		return &TreeNode{Size: numSamples}
	}

	splitIndex := rand.Intn(numFeatures)
	minValue, maxValue := MinMax(Column(samples, splitIndex))
	splitValue := rand.Float64()*(maxValue-minValue) + minValue

	var leftSamples, rightSamples [][]float64
	for _, vector := range samples {
		if vector[splitIndex] < splitValue {
			leftSamples = append(leftSamples, vector)
		} else {
			rightSamples = append(rightSamples, vector)
		}
	}

	return &TreeNode{
		Left:       f.BuildTree(leftSamples, depth+1),
		Right:      f.BuildTree(rightSamples, depth+1),
		SplitIndex: splitIndex,
		SplitValue: splitValue,
	}
}

// Score returns one anomaly score per sample, in input order.
//
// For each sample the path length is averaged over all trees and mapped to
// 2^(-meanPathLength / averagePathLength(SampleSize)), so shorter paths give
// scores closer to 1. The normalizer and tree count do not depend on the
// sample and are therefore computed once up front.
func (f *IsolationForest) Score(samples [][]float64) []float64 {
	numTrees := float64(len(f.Trees))
	normalizer := averagePathLength(float64(f.SampleSize))

	scores := make([]float64, len(samples))
	for i, sample := range samples {
		total := 0.0
		for _, tree := range f.Trees {
			total += pathLength(sample, tree, 0)
		}
		scores[i] = math.Pow(2.0, -total/numTrees/normalizer)
	}
	return scores
}

// Predict labels each sample 1 (anomaly) when its score is at or above the
// cutoff and 0 otherwise, in input order.
//
// With DetectionTypeThreshold the cutoff is Threshold. With
// DetectionTypeProportion the cutoff is the (1 - Proportion) quantile of the
// scores of the given samples; the scores are computed once and reused for
// both the quantile and the labelling. Empty input yields an empty result in
// both modes. Any other DetectionType panics.
func (f *IsolationForest) Predict(samples [][]float64) []int {
	scores := f.Score(samples)
	predictions := make([]int, len(samples))

	var threshold float64
	switch f.DetectionType {
	case DetectionTypeThreshold:
		threshold = f.Threshold
	case DetectionTypeProportion:
		if len(scores) == 0 {
			// Quantile panics on empty input and there is nothing to label.
			return predictions
		}
		// Quantile sorts a copy, so scores keeps its input order.
		threshold = Quantile(scores, 1-f.Proportion)
	default:
		panic("Invalid detection type")
	}

	// predictions is zero-initialized, so only anomalies need to be written.
	for i, score := range scores {
		if score >= threshold {
			predictions[i] = 1
		}
	}

	return predictions
}

// FeatureImportance sums, per feature index, the values that every tree's
// FeatureImportance reports for sample. The result has one entry per element
// of sample.
func (f *IsolationForest) FeatureImportance(sample []float64) []int {
	totals := make([]int, len(sample))
	for _, tree := range f.Trees {
		perTree := tree.FeatureImportance(sample)
		for feature := range perTree {
			totals[feature] += perTree[feature]
		}
	}
	return totals
}
34 changes: 34 additions & 0 deletions pkg/ensemble/iforest/forest_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
package iforest

import (
"testing"
)

// TestIsolationForest fits a default forest on a tiny data set containing one
// obvious outlier and checks that only the outlier is flagged.
func TestIsolationForest(t *testing.T) {
	cases := []struct {
		features    [][]float64
		predictions []int
	}{
		{
			[][]float64{
				{0, 0, 0},
				{0, 0, 0},
				{0, 0, 0},
				{1, 1, 1},
			},
			[]int{0, 0, 0, 1},
		},
	}

	for _, c := range cases {
		forest := New()
		forest.Fit(c.features)

		preds := forest.Predict(c.features)
		// Guard the index below and fail clearly on a length mismatch.
		if len(preds) != len(c.predictions) {
			t.Fatalf("expected %d predictions, got %d", len(c.predictions), len(preds))
		}
		for i, pred := range preds {
			if pred != c.predictions[i] {
				t.Errorf("sample %d (%v): expected %v, got %v", i, c.features[i], c.predictions[i], pred)
			}
		}
	}
}
66 changes: 66 additions & 0 deletions pkg/ensemble/iforest/math.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
package iforest

import (
"math"
"math/rand"
)

// Sample draws size distinct rows from matrix uniformly at random, without
// replacement. When matrix has size rows or fewer, matrix itself is returned
// (not a copy). The returned rows share storage with the input rows.
// It panics if size is not positive.
func Sample(matrix [][]float64, size int) [][]float64 {
	if size <= 0 {
		panic("size must be greater than 0")
	}

	total := len(matrix)
	if total <= size {
		return matrix
	}

	// The first size entries of a random permutation are the chosen rows.
	picked := make([][]float64, 0, size)
	for _, row := range rand.Perm(total)[:size] {
		picked = append(picked, matrix[row])
	}
	return picked
}

// Column returns a new slice holding element columnIndex of every row of
// matrix, in row order. It panics if any row is too short.
func Column(matrix [][]float64, columnIndex int) []float64 {
	values := make([]float64, 0, len(matrix))
	for _, row := range matrix {
		values = append(values, row[columnIndex])
	}
	return values
}

// MinMax returns the smallest and largest value in floats, in that order.
// For an empty slice it returns (+Inf, -Inf).
func MinMax(floats []float64) (float64, float64) {
	lowest := math.Inf(1)
	highest := math.Inf(-1)
	for _, value := range floats {
		// Two independent checks: the first element updates both bounds.
		if value < lowest {
			lowest = value
		}
		if value > highest {
			highest = value
		}
	}
	return lowest, highest
}

// RandomMatrix returns a rows x cols matrix whose entries are drawn from
// rand.Float64, i.e. uniformly from [0, 1).
func RandomMatrix(rows, cols int) [][]float64 {
	matrix := make([][]float64, rows)
	for r := range matrix {
		row := make([]float64, cols)
		for c := range row {
			row[c] = rand.Float64()
		}
		matrix[r] = row
	}
	return matrix
}

// AddScalar returns a new matrix in which scalar has been added to every
// element of matrix. The input matrix is not modified.
func AddScalar(matrix [][]float64, scalar float64) [][]float64 {
	shifted := make([][]float64, 0, len(matrix))
	for _, row := range matrix {
		out := make([]float64, len(row))
		for j, value := range row {
			out[j] = value + scalar
		}
		shifted = append(shifted, out)
	}
	return shifted
}
33 changes: 33 additions & 0 deletions pkg/ensemble/iforest/path.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
package iforest

import "math"

// eulerGamma is the Euler-Mascheroni constant, truncated to ten decimals.
const eulerGamma = 0.5772156649

// harmonicNumber approximates the x-th harmonic number as ln(x) + eulerGamma.
func harmonicNumber(x float64) float64 {
	logTerm := math.Log(x)
	return eulerGamma + logTerm
}

// averagePathLength returns the path-length adjustment for a node holding x
// samples: 2*harmonicNumber(x-1) - 2*(x-1)/x when x > 2, exactly 1 when
// x == 2, and 0 for anything else. Score uses it to normalize path lengths
// and pathLength uses it to account for samples left unsplit in a leaf.
func averagePathLength(x float64) float64 {
	switch {
	case x > 2:
		return 2.0*harmonicNumber(x-1) - 2.0*(x-1)/x
	case x == 2:
		return 1.0
	default:
		return 0.0
	}
}

// pathLength walks vector down the tree rooted at node and returns the number
// of edges traversed, starting the count at currentPathLength, plus
// averagePathLength of the size recorded on the leaf that is reached. At each
// internal node the walk goes left when the vector's split feature is strictly
// below the split value and right otherwise.
func pathLength(vector []float64, node *TreeNode, currentPathLength int) float64 {
	for !node.IsLeaf() {
		next := node.Right
		if vector[node.SplitIndex] < node.SplitValue {
			next = node.Left
		}
		node = next
		currentPathLength++
	}
	return float64(currentPathLength) + averagePathLength(float64(node.Size))
}
34 changes: 34 additions & 0 deletions pkg/ensemble/iforest/quantile.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
package iforest

import (
"fmt"
"math"
"sort"

"github.com/c9s/bbgo/pkg/datatype/floats"
)

// Quantile returns the q-th quantile of numbers, with q in [0, 1], using
// linear interpolation between the two closest ranks. The input is copied
// before sorting, so numbers is left untouched.
//
// It panics if numbers is empty or if q is NaN or outside [0, 1]. NaN is
// rejected explicitly because it passes both range comparisons and would
// otherwise be converted to a platform-dependent index.
func Quantile(numbers floats.Slice, q float64) float64 {
	if len(numbers) == 0 {
		panic("numbers must not be empty")
	}
	if math.IsNaN(q) || q < 0 || q > 1 {
		panic(fmt.Sprintf("q must be in [0, 1], got %v", q))
	}

	sortedNumbers := make(floats.Slice, len(numbers))
	copy(sortedNumbers, numbers)
	sort.Float64s(sortedNumbers)

	// Fractional rank of the quantile within the sorted data.
	n := float64(len(sortedNumbers))
	pos := q * (n - 1)
	lowerIndex := int(math.Floor(pos))
	upperIndex := int(math.Ceil(pos))
	if lowerIndex == upperIndex {
		return sortedNumbers[lowerIndex]
	}

	// linear interpolation
	fraction := pos - float64(lowerIndex)
	return sortedNumbers[lowerIndex] + fraction*(sortedNumbers[upperIndex]-sortedNumbers[lowerIndex])
}
27 changes: 27 additions & 0 deletions pkg/ensemble/iforest/quantile_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
package iforest

import (
"testing"

"github.com/c9s/bbgo/pkg/datatype/floats"
)

// TestQuantile checks Quantile at the median and at both extremes, including
// one case that requires interpolation between two elements.
func TestQuantile(t *testing.T) {
	type quantileCase struct {
		numbers  floats.Slice
		q        float64
		expected float64
	}

	table := []quantileCase{
		{floats.Slice{1, 2}, 0.5, 1.5},
		{floats.Slice{1, 2, 3, 4, 5}, 0.5, 3},
		{floats.Slice{1, 2, 3, 4, 5}, 1.0, 5},
		{floats.Slice{1, 2, 3, 4, 5}, 0.0, 1},
	}

	for i := range table {
		tc := table[i]
		if got := Quantile(tc.numbers, tc.q); got != tc.expected {
			t.Errorf("Quantile(%v, %v) == %v, expected %v", tc.numbers, tc.q, got, tc.expected)
		}
	}
}
Loading

0 comments on commit 7254820

Please sign in to comment.