-
-
Notifications
You must be signed in to change notification settings - Fork 300
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
31069e5
commit 1493234
Showing
11 changed files
with
597 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,175 @@ | ||
package iforest | ||
|
||
import ( | ||
"math" | ||
"math/rand" | ||
"sync" | ||
) | ||
|
||
const ( | ||
defaultNumTrees = 100 | ||
defaultSampleSize = 256 | ||
defaultScoreThreshold = 0.6 | ||
defaultDetectionType = DetectionTypeThreshold | ||
offset = 0.5 | ||
) | ||
|
||
// DetectionType names the strategy Predict uses to turn anomaly scores into
// outlier/normal labels.
type DetectionType string

// Supported detection strategies.
const (
	// DetectionTypeThreshold labels a sample as an outlier when its score
	// reaches the configured Threshold.
	DetectionTypeThreshold DetectionType = "threshold"
	// DetectionTypeProportion derives the cutoff from the (1 - Proportion)
	// quantile of the scores being predicted.
	DetectionTypeProportion DetectionType = "proportion"
)
|
||
type Options struct { | ||
DetectionType DetectionType `json:"detectionType"` | ||
Threshold float64 `json:"threshold"` | ||
Proportion float64 `json:"proportion"` | ||
NumTrees int `json:"numTrees"` | ||
SampleSize int `json:"sampleSize"` | ||
MaxDepth int `json:"maxDepth"` | ||
} | ||
|
||
// SetDefaultValues initializes zero-valued fields in Options with defaults. | ||
func (o *Options) SetDefaultValues() { | ||
if o.DetectionType == "" { | ||
o.DetectionType = defaultDetectionType | ||
} | ||
|
||
if o.Threshold == 0 { | ||
o.Threshold = defaultScoreThreshold | ||
} | ||
|
||
if o.NumTrees == 0 { | ||
o.NumTrees = defaultNumTrees | ||
} | ||
|
||
if o.SampleSize == 0 { | ||
o.SampleSize = defaultSampleSize | ||
} | ||
|
||
if o.MaxDepth == 0 { | ||
o.MaxDepth = int(math.Ceil(math.Log2(float64(o.SampleSize)))) | ||
} | ||
} | ||
|
||
// IsolationForest is an anomaly detection model using isolation trees. | ||
type IsolationForest struct { | ||
*Options | ||
|
||
Trees []*TreeNode | ||
} | ||
|
||
// New creates an IsolationForest with default options. | ||
func New() *IsolationForest { | ||
options := &Options{} | ||
options.SetDefaultValues() | ||
return &IsolationForest{Options: options} | ||
} | ||
|
||
// NewWithOptions creates an IsolationForest with the specified options. | ||
func NewWithOptions(options Options) *IsolationForest { | ||
options.SetDefaultValues() | ||
return &IsolationForest{Options: &options} | ||
} | ||
|
||
// Fit builds isolation trees from the training samples. | ||
func (f *IsolationForest) Fit(samples [][]float64) { | ||
wg := sync.WaitGroup{} | ||
wg.Add(f.NumTrees) | ||
|
||
f.Trees = make([]*TreeNode, f.NumTrees) | ||
for i := 0; i < f.NumTrees; i++ { | ||
sampled := SampleRows(samples, f.SampleSize) | ||
go func(index int) { | ||
defer wg.Done() | ||
tree := f.BuildTree(sampled, 0) | ||
f.Trees[index] = tree | ||
}(i) | ||
} | ||
wg.Wait() | ||
} | ||
|
||
// BuildTree recursively splits the data to isolate outliers. | ||
func (f *IsolationForest) BuildTree(samples [][]float64, depth int) *TreeNode { | ||
numSamples := len(samples) | ||
if numSamples == 0 { | ||
return &TreeNode{} | ||
} | ||
numFeatures := len(samples[0]) | ||
if depth >= f.MaxDepth || numSamples <= 1 { | ||
return &TreeNode{Size: numSamples} | ||
} | ||
|
||
splitIndex := rand.Intn(numFeatures) | ||
column := Column(samples, splitIndex) | ||
minValue, maxValue := MinMax(column) | ||
splitValue := rand.Float64()*(maxValue-minValue) + minValue | ||
|
||
leftSamples := make([][]float64, 0) | ||
rightSamples := make([][]float64, 0) | ||
for _, sample := range samples { | ||
if sample[splitIndex] < splitValue { | ||
leftSamples = append(leftSamples, sample) | ||
} else { | ||
rightSamples = append(rightSamples, sample) | ||
} | ||
} | ||
|
||
return &TreeNode{ | ||
Left: f.BuildTree(leftSamples, depth+1), | ||
Right: f.BuildTree(rightSamples, depth+1), | ||
SplitIndex: splitIndex, | ||
SplitValue: splitValue, | ||
} | ||
} | ||
|
||
// Score calculates anomaly scores for each sample based on path lengths. | ||
func (f *IsolationForest) Score(samples [][]float64) []float64 { | ||
scores := make([]float64, len(samples)) | ||
for i, sample := range samples { | ||
score := 0.0 | ||
for _, tree := range f.Trees { | ||
score += pathLength(sample, tree, 0) | ||
} | ||
scores[i] = math.Pow(2.0, -score/float64(len(f.Trees))/averagePathLength(float64(f.SampleSize))) | ||
} | ||
return scores | ||
} | ||
|
||
// Predict classifies samples as 1 (outlier) or 0 (normal) using the chosen detection type. | ||
func (f *IsolationForest) Predict(samples [][]float64) []int { | ||
predictions := make([]int, len(samples)) | ||
scores := f.Score(samples) | ||
|
||
var threshold float64 | ||
switch f.DetectionType { | ||
case DetectionTypeThreshold: | ||
threshold = f.Threshold | ||
case DetectionTypeProportion: | ||
threshold = Quantile(f.Score(samples), 1-f.Proportion) | ||
default: | ||
panic("Invalid detection type") | ||
} | ||
|
||
for i, score := range scores { | ||
if score >= threshold { | ||
predictions[i] = 1 | ||
} else { | ||
predictions[i] = 0 | ||
} | ||
} | ||
|
||
return predictions | ||
} | ||
|
||
// FeatureImportance sums up each feature's usage across all trees for a provided sample. | ||
func (f *IsolationForest) FeatureImportance(sample []float64) []int { | ||
importance := make([]int, len(sample)) | ||
for _, tree := range f.Trees { | ||
for i, value := range tree.FeatureImportance(sample) { | ||
importance[i] += value | ||
} | ||
} | ||
return importance | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
package iforest | ||
|
||
import ( | ||
"testing" | ||
) | ||
|
||
func TestIsolationForest(t *testing.T) { | ||
tests := []struct { | ||
features [][]float64 | ||
predictions []int | ||
}{ | ||
{ | ||
[][]float64{ | ||
{0, 0, 0}, | ||
{0, 0, 0}, | ||
{0, 0, 0}, | ||
{1, 1, 1}, | ||
}, | ||
[]int{0, 0, 0, 1}, | ||
}, | ||
} | ||
|
||
for _, tt := range tests { | ||
forest := New() | ||
forest.Fit(tt.features) | ||
|
||
preds := forest.Predict(tt.features) | ||
for i, pred := range preds { | ||
if pred != tt.predictions[i] { | ||
t.Errorf("expected %v, got %v", tt.predictions[i], pred) | ||
} | ||
} | ||
} | ||
} | ||
|
||
func TestIsolationForestOnRandomSamples(t *testing.T) { | ||
dim := 2 | ||
|
||
samples := RandomMatrix(1000, dim) | ||
|
||
forest := New() | ||
forest.Fit(samples) | ||
|
||
outliers := RandomMatrix(5, dim) | ||
outliers = AddScalar(outliers, 10) | ||
preds := forest.Predict(outliers) | ||
for _, pred := range preds { | ||
if pred != 1 { | ||
t.Errorf("expected 1, got %v", pred) | ||
} | ||
} | ||
|
||
preds = forest.Predict(SampleRows(samples, 5)) | ||
for _, pred := range preds { | ||
if pred != 0 { | ||
t.Errorf("expected 0, got %v", pred) | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
package iforest | ||
|
||
import ( | ||
"math" | ||
"math/rand" | ||
) | ||
|
||
// SampleRows draws size distinct rows from matrix at random.
//
// When matrix has size rows or fewer, matrix itself is returned (not a copy).
// The returned rows always alias the rows of matrix. SampleRows panics if
// size is not positive.
func SampleRows(matrix [][]float64, size int) [][]float64 {
	if size <= 0 {
		panic("size must be greater than 0")
	}

	total := len(matrix)
	if total <= size {
		return matrix
	}

	// The first size entries of a random permutation are distinct row indices.
	picked := make([][]float64, 0, size)
	for _, rowIndex := range rand.Perm(total)[:size] {
		picked = append(picked, matrix[rowIndex])
	}
	return picked
}
|
||
// Column extracts column columnIndex of matrix into a new slice with one
// entry per row. It panics if any row is shorter than columnIndex+1.
func Column(matrix [][]float64, columnIndex int) []float64 {
	values := make([]float64, 0, len(matrix))
	for r := range matrix {
		values = append(values, matrix[r][columnIndex])
	}
	return values
}
|
||
// MinMax returns the smallest and largest value in floats, in that order.
// For an empty slice it returns (+Inf, -Inf). NaN entries never compare as
// smaller or larger and are therefore ignored.
func MinMax(floats []float64) (float64, float64) {
	smallest := math.Inf(1)
	largest := math.Inf(-1)
	for i := range floats {
		current := floats[i]
		if current < smallest {
			smallest = current
		}
		if current > largest {
			largest = current
		}
	}
	return smallest, largest
}
|
||
// RandomMatrix returns a rows-by-cols matrix whose entries are drawn from
// rand.Float64, i.e. uniformly from [0.0, 1.0).
func RandomMatrix(rows, cols int) [][]float64 {
	matrix := make([][]float64, rows)
	for i := range matrix {
		row := make([]float64, cols)
		for j := range row {
			row[j] = rand.Float64()
		}
		matrix[i] = row
	}
	return matrix
}
|
||
// AddScalar returns a new matrix in which scalar has been added to every
// entry of matrix. The input matrix is not modified.
func AddScalar(matrix [][]float64, scalar float64) [][]float64 {
	shifted := make([][]float64, len(matrix))
	for i := range matrix {
		shifted[i] = make([]float64, len(matrix[i]))
		for j, value := range matrix[i] {
			shifted[i][j] = value + scalar
		}
	}
	return shifted
}
Oops, something went wrong.