From 36218ccd5d8f1611c6f065b55b5c68d8f8b6f2f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Sultan?= Date: Sun, 31 Oct 2021 18:23:41 +0100 Subject: [PATCH 1/3] develop --> master for release 2.1.3 (#79) * chore: provide sources in build (#76) * refactor: protected fields for isolation forests (#77) * refactor: make PCA raw trainable (#78) --- .../isolationforest/IsolationForest.java | 19 +- .../isolationforest/IsolationTree.java | 11 +- .../dimred/PrincipalComponentAnalysis.java | 56 ++-- pom.xml | 266 +++++++++--------- 4 files changed, 196 insertions(+), 156 deletions(-) diff --git a/java-ml/src/main/java/org/rsultan/core/clustering/ensemble/isolationforest/IsolationForest.java b/java-ml/src/main/java/org/rsultan/core/clustering/ensemble/isolationforest/IsolationForest.java index 6be020a..9d9fa2a 100644 --- a/java-ml/src/main/java/org/rsultan/core/clustering/ensemble/isolationforest/IsolationForest.java +++ b/java-ml/src/main/java/org/rsultan/core/clustering/ensemble/isolationforest/IsolationForest.java @@ -5,9 +5,12 @@ import java.util.List; import java.util.stream.DoubleStream; +import java.util.stream.LongStream; import org.apache.commons.lang3.RandomUtils; import org.nd4j.linalg.api.ndarray.INDArray; import org.nd4j.linalg.factory.Nd4j; +import org.nd4j.linalg.indexing.INDArrayIndex; +import org.nd4j.linalg.indexing.NDArrayIndex; import org.nd4j.linalg.ops.transforms.Transforms; import org.rsultan.core.Trainable; import org.rsultan.dataframe.Column; @@ -18,10 +21,10 @@ public class IsolationForest implements Trainable { private static final Logger LOG = LoggerFactory.getLogger(IsolationTree.class); - private final int nbTrees; - private double anomalyThreshold = 0.5; - private List isolationTrees; - private int sampleSize = 256; + protected final int nbTrees; + protected double anomalyThreshold = 0.5; + protected List isolationTrees; + protected int sampleSize = 256; public IsolationForest(int nbTrees) { this.nbTrees = nbTrees; @@ -44,9 +47,11 @@ public IsolationForest train(Dataframe dataframe) { int treeDepth = (int) Math.ceil(Math.log(realSample) / Math.log(2)); isolationTrees = range(0, nbTrees).parallel() .peek(i -> LOG.info("Tree number: {}", i)) - .mapToObj(i -> range(0, realSample) - .map(idx -> RandomUtils.nextInt(0, matrix.rows())) - .toArray()).map(matrix::getRows) + .mapToObj(i -> LongStream.range(0, realSample) + .map(idx -> RandomUtils.nextLong(0, matrix.rows())) + .toArray()) + .map(NDArrayIndex::indices) + .map(matrix::get) .map(m -> new IsolationTree(treeDepth).train(m)) .toList(); return this; diff --git a/java-ml/src/main/java/org/rsultan/core/clustering/ensemble/isolationforest/IsolationTree.java b/java-ml/src/main/java/org/rsultan/core/clustering/ensemble/isolationforest/IsolationTree.java index 4882ec6..b1801c8 100644 --- a/java-ml/src/main/java/org/rsultan/core/clustering/ensemble/isolationforest/IsolationTree.java +++ b/java-ml/src/main/java/org/rsultan/core/clustering/ensemble/isolationforest/IsolationTree.java @@ -1,12 +1,13 @@ package org.rsultan.core.clustering.ensemble.isolationforest; -import static java.util.stream.IntStream.range; +import static java.util.stream.LongStream.range; import static org.apache.commons.lang3.RandomUtils.nextDouble; import static org.apache.commons.lang3.RandomUtils.nextInt; import static org.rsultan.core.clustering.ensemble.isolationforest.utils.ScoreUtils.averagePathLength; import org.nd4j.linalg.api.ndarray.INDArray; import org.nd4j.linalg.factory.Nd4j; +import org.nd4j.linalg.indexing.NDArrayIndex; import org.rsultan.core.RawTrainable; import org.rsultan.core.clustering.ensemble.domain.IsolationNode; import org.slf4j.Logger; @@ -45,12 +46,12 @@ private IsolationNode buildTree(INDArray matrix, int currentDepth) { var leftIndices = range(0, feature.columns()).parallel() .filter(idx -> feature.getDouble(idx) < valueSplit) .toArray(); - var left = matrix.getRows(leftIndices); + var left = getVector(matrix, leftIndices); var rightIndices = range(0, feature.columns()).parallel() .filter(idx -> feature.getDouble(idx) > valueSplit) .toArray(); - var right = matrix.getRows(rightIndices); + var right = getVector(matrix, rightIndices); return new IsolationNode( splitFeature, @@ -60,6 +61,10 @@ private IsolationNode buildTree(INDArray matrix, int currentDepth) { ); } + private INDArray getVector(INDArray matrix, long[] indices) { + return matrix.get(NDArrayIndex.indices(indices)); + } + private double getValueSplit(double startInclusive, double endInclusive) { if (startInclusive < 0 && endInclusive < 0) { return -nextDouble(endInclusive * -1, startInclusive * -1); diff --git a/java-ml/src/main/java/org/rsultan/core/dimred/PrincipalComponentAnalysis.java b/java-ml/src/main/java/org/rsultan/core/dimred/PrincipalComponentAnalysis.java index ab47a04..8b183d8 100644 --- a/java-ml/src/main/java/org/rsultan/core/dimred/PrincipalComponentAnalysis.java +++ b/java-ml/src/main/java/org/rsultan/core/dimred/PrincipalComponentAnalysis.java @@ -6,8 +6,8 @@ import static org.nd4j.linalg.eigen.Eigen.symmetricGeneralizedEigenvalues; import java.util.List; -import org.nd4j.common.util.ArrayUtil; import org.nd4j.linalg.api.ndarray.INDArray; +import org.rsultan.core.RawTrainable; import org.rsultan.core.Trainable; import org.rsultan.dataframe.Column; import org.rsultan.dataframe.Dataframe; @@ -16,7 +16,8 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -public class PrincipalComponentAnalysis implements Trainable { +public class PrincipalComponentAnalysis implements + Trainable, RawTrainable { private static final Logger LOG = LoggerFactory.getLogger(PrincipalComponentAnalysis.class); @@ -35,26 +36,14 @@ public PrincipalComponentAnalysis(int numberOfComponents) { public PrincipalComponentAnalysis train(Dataframe dataframe) { var X = dataframe.mapWithout(responseVariable).toMatrix(); this.responseVariableData = dataframe.get(responseVariable); - int components = Math.min(numberOfComponent, X.columns()); - Xmean = X.mean(0); - X = X.sub(Xmean); - LOG.info("computing covariance matrix"); - LOG.info("computing eighenvectors"); - eighenVectors = Matrices.covariance(X); - var eighenValuesArgSort = argsort( - symmetricGeneralizedEigenvalues(eighenVectors, true).toIntVector(), false - ); - eighenVectors = eighenVectors - .getColumns(eighenValuesArgSort) - .getColumns(range(0, components).toArray()); - return this; + return this.train(X); } @Override public Dataframe predict(Dataframe dataframe) { - var Xpredict = dataframe.mapWithout(responseVariable).toMatrix().sub(Xmean); + var Xpredict = dataframe.mapWithout(responseVariable).toMatrix(); LOG.info("computing predictions"); - predictions = eighenVectors.transpose().mmul(Xpredict.transpose()).transpose(); + this.predict(Xpredict); List> columns = range(0, predictions.columns()) .mapToObj(colIdx -> new Column<>("c" + colIdx, range(0, predictions.rows()) .mapToObj(rowIdx -> predictions.getDouble(rowIdx, colIdx)) @@ -64,9 +53,27 @@ public Dataframe predict(Dataframe dataframe) { return Dataframes.create(columns.toArray(Column[]::new)); } + @Override + public PrincipalComponentAnalysis train(INDArray X) { + int components = Math.min(numberOfComponent, X.columns()); + Xmean = X.mean(0); + X = X.sub(Xmean); + LOG.info("computing covariance matrix"); + eighenVectors = Matrices.covariance(X); + LOG.info("computing eighenvectors"); + var eighenValuesArgSort = argsort( + symmetricGeneralizedEigenvalues(eighenVectors, true).toIntVector(), false + ); + eighenVectors = eighenVectors + .getColumns(eighenValuesArgSort) + .getColumns(range(0, components).toArray()); + LOG.info("eighenvectors computed"); + return this; + } + public Dataframe reconstruct() { - LOG.info("reconstructing original matrix"); - var XreBuilt = predictions.mmul(eighenVectors.transpose()).add(Xmean); + LOG.info("trying to reconstruct original matrix"); + var XreBuilt = rawReconstruct(); List> columns = range(0, XreBuilt.columns()) .mapToObj(colIdx -> new Column<>("c" + colIdx, range(0, XreBuilt.rows()) .mapToObj(rowIdx -> XreBuilt.getDouble(rowIdx, colIdx)) @@ -76,8 +83,19 @@ public Dataframe reconstruct() { return Dataframes.create(columns.toArray(Column[]::new)); } + public INDArray rawReconstruct() { + return predictions.mmul(eighenVectors.transpose()).add(Xmean); + } + + @Override + public INDArray predict(INDArray matrix) { + predictions = eighenVectors.transpose().mmul(matrix.sub(Xmean).transpose()).transpose(); + return predictions; + } + public PrincipalComponentAnalysis setResponseVariable(String responseVariable) { this.responseVariable = responseVariable; return this; } + } diff --git a/pom.xml b/pom.xml index 93a04e5..807b1f5 100644 --- a/pom.xml +++ b/pom.xml @@ -1,128 +1,140 @@ - - - - 4.0.0 - org.rsultan - java-ml-parent - 2.1.3-SNAPSHOT - java-ml-parent - pom - - - - release.archiva.rsultan.org - https://archiva.rsultan.org/repository/internal - - - - - 3.9.1 - 1.0 - 16 - 5.7.0 - ${java.version} - ${java.version} - UTF-8 - 2.0.0-alpha1 - 2.9.0 - - - - java-ml - java-ml-example - - - - - org.slf4j - slf4j-log4j12 - ${slf4j-log4j12.version} - - - org.slf4j - slf4j-api - ${slf4j-log4j12.version} - - - org.junit.jupiter - junit-jupiter - ${junit-jupiter.version} - test - - - org.assertj - assertj-core - ${assertj-core.version} - test - - - org.junit.jupiter - junit-jupiter-params - 5.7.0 - compile - - - - - java-ml - - - - maven-clean-plugin - 3.1.0 - - - maven-resources-plugin - 3.0.2 - - - maven-compiler-plugin - 3.8.0 - - - maven-surefire-plugin - 3.0.0-M5 - - - 1 - 0 - - - - - maven-jar-plugin - 3.0.2 - - - maven-install-plugin - 2.5.2 - - - maven-deploy-plugin - 2.8.2 - - - maven-site-plugin - 3.7.1 - - - maven-project-info-reports-plugin - 3.0.0 - - - - - - org.apache.maven.plugins - maven-compiler-plugin - - ${java.version} - ${java.version} - - - - - + + + + 4.0.0 + org.rsultan + java-ml-parent + 2.1.3-SNAPSHOT + java-ml-parent + pom + + + + release.archiva.rsultan.org + https://archiva.rsultan.org/repository/internal + + + + + 3.9.1 + 1.0 + 16 + 5.7.0 + ${java.version} + ${java.version} + UTF-8 + 2.0.0-alpha1 + 2.9.0 + + + + java-ml + java-ml-example + + + + + org.slf4j + slf4j-log4j12 + ${slf4j-log4j12.version} + + + org.slf4j + slf4j-api + ${slf4j-log4j12.version} + + + org.junit.jupiter + junit-jupiter + ${junit-jupiter.version} + test + + + org.assertj + assertj-core + ${assertj-core.version} + test + + + org.junit.jupiter + junit-jupiter-params + 5.7.0 + compile + + + + + java-ml + + + + maven-clean-plugin + 3.1.0 + + + maven-resources-plugin + 3.0.2 + + + maven-compiler-plugin + 3.8.0 + + + maven-surefire-plugin + 3.0.0-M5 + + + 1 + 0 + + + + + maven-jar-plugin + 3.0.2 + + + maven-install-plugin + 2.5.2 + + + maven-deploy-plugin + 2.8.2 + + + maven-site-plugin + 3.7.1 + + + maven-project-info-reports-plugin + 3.0.0 + + + + + + org.apache.maven.plugins + maven-compiler-plugin + + ${java.version} + ${java.version} + + + + org.apache.maven.plugins + maven-source-plugin + + + attach-sources + + jar + + + + + + + \ No newline at end of file From 03e3e43695f0f35674bdc9847e7a10e270cc5953 Mon Sep 17 00:00:00 2001 From: JavaML autodeploy Date: Sun, 31 Oct 2021 17:24:11 +0000 Subject: [PATCH 2/3] release: v-2.1.3 --- java-ml-example/pom.xml | 4 ++-- java-ml/pom.xml | 4 ++-- pom.xml | 2 +- version.properties | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/java-ml-example/pom.xml b/java-ml-example/pom.xml index b0c6f82..de946f4 100644 --- a/java-ml-example/pom.xml +++ b/java-ml-example/pom.xml @@ -6,11 +6,11 @@ java-ml-parent org.rsultan - 2.1.3-SNAPSHOT + 2.1.3 java-ml-example - 2.1.3-SNAPSHOT + 2.1.3 java-ml-example diff --git a/java-ml/pom.xml b/java-ml/pom.xml index 9b8e905..e4e83c9 100644 --- a/java-ml/pom.xml +++ b/java-ml/pom.xml @@ -8,11 +8,11 @@ org.rsultan java-ml-parent - 2.1.3-SNAPSHOT + 2.1.3 java-ml - 2.1.3-SNAPSHOT + 2.1.3 java-ml diff --git a/pom.xml b/pom.xml index 807b1f5..90543f3 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 4.0.0 org.rsultan java-ml-parent - 2.1.3-SNAPSHOT + 2.1.3 java-ml-parent pom diff --git a/version.properties b/version.properties index 7665a6f..82f2bfb 100755 --- a/version.properties +++ b/version.properties @@ -1 +1 @@ -version.next=2.1.3 +version.next=2.1.4 From 25e2c40a1f019e5c595cedf8786cdf75f3ce0bc0 Mon Sep 17 00:00:00 2001 From: JavaML autodeploy Date: Sun, 31 Oct 2021 17:25:24 +0000 Subject: [PATCH 3/3] chore: jumping onto next development iteration --- java-ml-example/pom.xml | 4 ++-- java-ml/pom.xml | 4 ++-- pom.xml | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/java-ml-example/pom.xml b/java-ml-example/pom.xml index de946f4..1329f0e 100644 --- a/java-ml-example/pom.xml +++ b/java-ml-example/pom.xml @@ -6,11 +6,11 @@ java-ml-parent org.rsultan - 2.1.3 + 2.1.4-SNAPSHOT java-ml-example - 2.1.3 + 2.1.4-SNAPSHOT java-ml-example diff --git a/java-ml/pom.xml b/java-ml/pom.xml index e4e83c9..78d8a98 100644 --- a/java-ml/pom.xml +++ b/java-ml/pom.xml @@ -8,11 +8,11 @@ org.rsultan java-ml-parent - 2.1.3 + 2.1.4-SNAPSHOT java-ml - 2.1.3 + 2.1.4-SNAPSHOT java-ml diff --git a/pom.xml b/pom.xml index 90543f3..fe5d5d3 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 4.0.0 org.rsultan java-ml-parent - 2.1.3 + 2.1.4-SNAPSHOT java-ml-parent pom