Skip to content

Commit

Permalink
develop --> master for release 2.1.0 #72
Browse files Browse the repository at this point in the history
develop --> master for release 2.1.0
  • Loading branch information
remisultan authored Oct 11, 2021
2 parents e018bc0 + de6ebf3 commit 59add51
Show file tree
Hide file tree
Showing 17 changed files with 580,196 additions and 2 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
package org.rsultan.example;

import java.io.IOException;
import org.nd4j.linalg.api.buffer.DataType;
import org.nd4j.linalg.factory.Nd4j;
import org.rsultan.core.clustering.ensemble.evaluation.TPRThresholdEvaluator;
import org.rsultan.core.clustering.ensemble.isolationforest.IsolationForest;
import org.rsultan.dataframe.Dataframes;
import org.rsultan.dataframe.TrainTestDataframe;

public class IsolationForestExample {

/*
You can use the http dataset --> args[0]
*/
static {
Nd4j.setDefaultDataTypes(DataType.DOUBLE, DataType.DOUBLE);
}

public static void main(String[] args) throws IOException {
var df = Dataframes.csv(args[0], ",", "\"", true);
var trainTestDataframe = Dataframes.trainTest(df.getColumns()).setSplitValue(0.5);

IsolationForest model = new IsolationForest(200).train(df.mapWithout("attack"));
var evaluator = new TPRThresholdEvaluator("attack", "anomalies").setDesiredTPR(0.9).setLearningRate(0.02);
Double threshold = evaluator.evaluate(model, trainTestDataframe);
System.out.println("threshold = " + threshold);
evaluator.showMetrics();
}
}
567,499 changes: 567,499 additions & 0 deletions java-ml-example/src/main/resources/http/http.csv

Large diffs are not rendered by default.

12,212 changes: 12,212 additions & 0 deletions java-ml-example/src/main/resources/http/http_reduced.csv

Large diffs are not rendered by default.

10 changes: 10 additions & 0 deletions java-ml/src/main/java/org/rsultan/core/Evaluator.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
package org.rsultan.core;

import org.rsultan.core.Trainable;
import org.rsultan.dataframe.TrainTestDataframe;

public interface Evaluator<V, T extends Trainable<T>> {

V evaluate(T trainable, TrainTestDataframe dataframe);

}
13 changes: 13 additions & 0 deletions java-ml/src/main/java/org/rsultan/core/RawTrainable.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
package org.rsultan.core;

import java.io.Serializable;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.rsultan.dataframe.Dataframe;

public interface RawTrainable<T> extends Serializable {

T train(INDArray matrix);

INDArray predict(INDArray matrix);

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
package org.rsultan.core.clustering.ensemble.domain;

import static java.util.Objects.nonNull;

import java.io.Serializable;
import org.nd4j.linalg.api.ndarray.INDArray;

public record IsolationNode(
int feature,
double featureThreshold,
INDArray data,
IsolationNode left,
IsolationNode right
) implements Serializable {

public IsolationNode(INDArray data) {
this(-1, -1, data, null, null);
}
public IsolationNode(
int feature,
double featureThreshold,
IsolationNode left,
IsolationNode right) {
this(feature, featureThreshold, null, left, right);
}

public boolean isLeaf() {
return nonNull(data);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
package org.rsultan.core.clustering.ensemble.evaluation;

import org.rsultan.core.Evaluator;
import org.rsultan.core.clustering.ensemble.isolationforest.IsolationForest;
import org.rsultan.dataframe.Dataframes;
import org.rsultan.dataframe.Row;
import org.rsultan.dataframe.TrainTestDataframe;

public class TPRThresholdEvaluator implements Evaluator<Double, IsolationForest> {

private final String responseVariable;
private final String predictionColumn;
private double desiredTPR = 0.9;
private double learningRate = 0.01;
private double TPR = 0;
private double FPR = 0;

public TPRThresholdEvaluator(String responseVariable, String predictionColumn) {
this.responseVariable = responseVariable;
this.predictionColumn = predictionColumn;
}

@Override
public Double evaluate(IsolationForest trainable, TrainTestDataframe dataframe) {
var dfSplit = dataframe.shuffle().split();
double threshold = 1;
while (threshold > 0 && TPR <= desiredTPR) {
threshold -= learningRate;
var trained = trainable.setAnomalyThreshold(threshold)
.train(dfSplit.train().mapWithout(responseVariable));
var responses = dfSplit.test().<Long>get(responseVariable);
var predictions = trained.predict(dfSplit.test().mapWithout(responseVariable))
.<Long>get(predictionColumn);

double truePositives = 0;
double trueNegatives = 0;
double falsePositives = 0;
double falseNegative = 0;

for (int i = 0; i < responses.size(); i++) {
var response = responses.get(i);
var prediction = predictions.get(i);
truePositives += response == 1L && prediction == 1L ? 1L : 0L;
trueNegatives += response == 0L && prediction == 0L ? 1L : 0L;
falsePositives += response == 0L && prediction == 1L ? 1L : 0L;
falseNegative += response == 1L && prediction == 0L ? 1L : 0L;
}
TPR = truePositives / (truePositives + falseNegative);
TPR = Double.isNaN(TPR) ? 0 : TPR;

FPR = falsePositives / (falsePositives + trueNegatives);
FPR = Double.isNaN(FPR) ? 0L : FPR;
}
if (threshold < 0) {
throw new IllegalArgumentException("Cannot have desired TPR");
}

return threshold;
}

public TPRThresholdEvaluator setDesiredTPR(double desiredTPR) {
this.desiredTPR = desiredTPR;
return this;
}

public TPRThresholdEvaluator setLearningRate(double learningRate) {
this.learningRate = learningRate;
return this;
}

public void showMetrics() {
Dataframes.create(new String[]{"TPR", "FPR"}, new Row(TPR, FPR)).tail();
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
package org.rsultan.core.clustering.ensemble.isolationforest;

import static java.util.stream.IntStream.range;
import static org.rsultan.core.clustering.ensemble.isolationforest.utils.ScoreUtils.averagePathLength;

import java.util.List;
import java.util.stream.DoubleStream;
import org.apache.commons.lang3.RandomUtils;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;
import org.nd4j.linalg.ops.transforms.Transforms;
import org.rsultan.core.Trainable;
import org.rsultan.dataframe.Column;
import org.rsultan.dataframe.Dataframe;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class IsolationForest implements Trainable<IsolationForest> {

private static final Logger LOG = LoggerFactory.getLogger(IsolationTree.class);
private final int nbTrees;
private double anomalyThreshold = 0.5;
private List<IsolationTree> isolationTrees;
private int sampleSize = 256;

public IsolationForest(int nbTrees) {
this.nbTrees = nbTrees;
}

public IsolationForest setSampleSize(int sampleSize) {
this.sampleSize = sampleSize;
return this;
}

public IsolationForest setAnomalyThreshold(double anomalyThreshold) {
this.anomalyThreshold = anomalyThreshold;
return this;
}

@Override
public IsolationForest train(Dataframe dataframe) {
var matrix = dataframe.toMatrix();
int realSample = sampleSize >= matrix.rows() ? sampleSize / 10 : sampleSize;
int treeDepth = (int) Math.ceil(Math.log(realSample) / Math.log(2));
isolationTrees = range(0, nbTrees).parallel()
.peek(i -> LOG.info("Tree number: {}", i))
.mapToObj(i -> range(0, realSample)
.map(idx -> RandomUtils.nextInt(0, matrix.rows()))
.toArray()).map(matrix::getRows)
.map(m -> new IsolationTree(treeDepth).train(m))
.toList();
return this;
}

@Override
public Dataframe predict(Dataframe dataframe) {
var matrix = dataframe.toMatrix();
var anomalyScores = computeAnomalyScore(matrix);
var isAnomaly = new Column<>("anomalies", DoubleStream.of(
anomalyScores.toDoubleVector()
).mapToObj(score -> score >= anomalyThreshold ? 1L : 0L).toArray());
return dataframe.addColumn(isAnomaly);
}

private INDArray computeAnomalyScore(INDArray matrix) {
var pathLengths = isolationTrees.stream().parallel().map(tree -> {
LOG.info("Compute paths for tree {}", isolationTrees.indexOf(tree) + 1);
return tree.predict(matrix);
}).toList();
int[] shape = {pathLengths.size(), pathLengths.get(0).columns()};
var avgLength = Nd4j.create(pathLengths, shape).mean(true, 0);
var twos = Nd4j.ones(avgLength.shape()).mul(2D);
return Transforms.pow(twos, avgLength.neg().div(averagePathLength(sampleSize)));
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
package org.rsultan.core.clustering.ensemble.isolationforest;

import static java.util.stream.IntStream.range;
import static org.apache.commons.lang3.RandomUtils.nextDouble;
import static org.apache.commons.lang3.RandomUtils.nextInt;
import static org.rsultan.core.clustering.ensemble.isolationforest.utils.ScoreUtils.averagePathLength;

import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;
import org.rsultan.core.RawTrainable;
import org.rsultan.core.clustering.ensemble.domain.IsolationNode;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class IsolationTree implements RawTrainable<IsolationTree> {

public static final Logger LOG = LoggerFactory.getLogger(IsolationTree.class);
private final int treeDepthLimit;
private IsolationNode tree;

public IsolationTree(int treeDepthLimit) {
this.treeDepthLimit = treeDepthLimit;
}

@Override
public IsolationTree train(INDArray matrix) {
this.tree = buildTree(matrix, treeDepthLimit);
return this;
}

private IsolationNode buildTree(INDArray matrix, int currentDepth) {
LOG.info("Tree Depth {}", currentDepth);
if (currentDepth <= 0 || matrix.rows() <= 2) {
return new IsolationNode(matrix);
}
int numberOfFeatures = matrix.columns();
int splitFeature = nextInt(0, numberOfFeatures);
var feature = matrix.getColumn(splitFeature);
double startInclusive = feature.minNumber().doubleValue();
double endInclusive = feature.maxNumber().doubleValue();

double valueSplit =
getValueSplit(startInclusive, endInclusive);

var leftIndices = range(0, feature.columns()).parallel()
.filter(idx -> feature.getDouble(idx) < valueSplit)
.toArray();
var left = matrix.getRows(leftIndices);

var rightIndices = range(0, feature.columns()).parallel()
.filter(idx -> feature.getDouble(idx) > valueSplit)
.toArray();
var right = matrix.getRows(rightIndices);

return new IsolationNode(
splitFeature,
valueSplit,
buildTree(left, currentDepth - 1),
buildTree(right, currentDepth - 1)
);
}

private double getValueSplit(double startInclusive, double endInclusive) {
if (startInclusive < 0 && endInclusive < 0) {
return -nextDouble(endInclusive * -1, startInclusive * -1);
} else if (startInclusive < 0 && endInclusive >= 0) {
return nextDouble(0, endInclusive + startInclusive * -1) + startInclusive;
}
return nextDouble(startInclusive, endInclusive);
}

@Override
public INDArray predict(INDArray matrix) {
var pathLengths = Nd4j.zeros(1, matrix.rows());
for (int i = 0; i < matrix.rows(); i++) {
var row = matrix.getRow(i);
var node = tree;
int length = 0;
while (!node.isLeaf()) {
node = row.getDouble(node.feature()) < node.featureThreshold() ? node.left() : node.right();
length++;
}
int leafSize = node.data().rows();
pathLengths.put(0, i, length + averagePathLength(leafSize));
}
return pathLengths;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
package org.rsultan.core.clustering.ensemble.isolationforest.utils;

public class ScoreUtils {

private static final double EULER_CONSTANT = 0.5772156649;

public static double averagePathLength(double leafSize) {
if (leafSize > 2) {
return 2 * harmonicNumber(leafSize) - (2 * (leafSize - 1) / leafSize);
}
if (leafSize == 2) {
return 1;
}
return 0;
}

private static double harmonicNumber(double leafSize) {
return Math.log(leafSize - 1) + EULER_CONSTANT;
}
}
5 changes: 5 additions & 0 deletions java-ml/src/main/java/org/rsultan/dataframe/Dataframe.java
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import java.util.function.Supplier;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.rsultan.dataframe.printer.DataframePrinter;
import org.rsultan.dataframe.printer.DataframeWriter;
import org.rsultan.dataframe.transform.filter.FilterDataframe;
import org.rsultan.dataframe.transform.filter.FilterTransform;
import org.rsultan.dataframe.transform.map.MapDataframe;
Expand Down Expand Up @@ -137,6 +138,10 @@ public void show(int start, int end) {
DataframePrinter.create(data).print(max(0, start), min(end, this.rowSize));
}

public void write(String filename, String separator, String enclosure) {
DataframeWriter.write(this, filename, separator, enclosure);
}

public void tail() {
show(this.rowSize - 10, this.rowSize);
}
Expand Down
Loading

0 comments on commit 59add51

Please sign in to comment.