v1.3.0 (#28)

kunpengcompute · Jul 6, 2021 · 4f7d34c · 4f7d34c
1 parent 0a008a2
commit 4f7d34c
Show file tree

Hide file tree

Showing 1,007 changed files with 234,227 additions and 40 deletions.
diff --git a/README.md b/README.md
@@ -5,27 +5,33 @@
 Introduction
 ============
 
-The machine learning algorithm library running on Kunpeng processors is an acceleration library that provides a rich set of high-level tools for machine learning algorithms. It is based on the original APIs of Apache [Spark 2.3.2](https://github.com/apache/spark/tree/v2.3.2) and [breeze 0.13.1](https://github.com/scalanlp/breeze/tree/releases/v0.13.1). The acceleration library for greatly improves the computing power in big data scenarios.
+The machine learning algorithm library running on Kunpeng processors is an acceleration library that provides a rich set of high-level tools for machine learning algorithms. It is based on the original APIs of Apache [Spark 2.3.2](https://github.com/apache/spark/tree/v2.3.2), [breeze 0.13.1](https://github.com/scalanlp/breeze/tree/releases/v0.13.1) and [xgboost 1.1.0](https://github.com/dmlc/xgboost/tree/release_1.0.0). The acceleration library for greatly improves the computing power in big data scenarios.
 
-The library provides nine machine learning algorithms: support vector machine (SVM), random forest classifier (RFC), gradient boosting decision tree (GBDT), decision tree (DT), K-means clustering, linear regression, logistic regression algorithm, principal component analysis (PCA), singular value decomposition (SVD), latent dirichlet allocation (LDA), prefix-projected pattern prowth (Prefix-Span), alternating least squares (ALS), and K-nearest neighbors (KNN). You can find the latest documentation on the project web page. This README file contains only basic setup instructions.
+The library provides eighteen machine learning algorithms: support vector machine (SVM), random forest classifier (RFC), gradient boosting decision tree (GBDT), decision tree (DT), K-means clustering, linear regression, logistic regression algorithm, principal component analysis (PCA), singular value decomposition (SVD), latent dirichlet allocation (LDA), prefix-projected pattern prowth (Prefix-Span), alternating least squares (ALS), K-nearest neighbors (KNN), Covariance, Density-based spatial clustering of applicaitons with noise (DBSCAN), Pearson, Spearman, and XGboost. You can find the latest documentation on the project web page. This README file contains only basic setup instructions.
 You can find the latest documentation, including a programming guide, on the project web page. This README file only contains basic setup instructions.
 
 
 
 
 
-Building and Packaging
+Building And Packageing
 ====================
 
 (1) Build the project under the "Spark-ml-algo-lib" directory:
 
     mvn clean package
 
+(2) Build XGBoost project under the "Spark-ml-algo-lib/ml-xgboost/jvm-packages" directory:
 
-(2) Obtain "sophon-ml-core_2.11-1.2.0.jar" under the "Spark-ml-algo-lib/ml-core/target" directory.
+    mvn clean package
+
+(3) Obtain "boostkit-ml-core_2.11-1.3.0-spark2.3.2.jar" under the "Spark-ml-algo-lib/ml-core/target" directory.
+
+   Obtain "boostkit-ml-acc_2.11-1.3.0-spark2.3.2.jar" under the "Spark-ml-algo-lib/ml-accelerator/target" directory.
 
-   Obtain "sophon-ml-acc_2.11-1.2.0.jar" under the "Spark-ml-algo-lib/ml-accelerator/target" directory.
+   Obtain "boostkit-xgboost4j_2.11-1.3.0.jar" under the "Spark-ml-algo-lib/ml-xgboost/jvm-packages/boostkit-xgboost4j/target" directory.
 
+   Obtain "boostkit-xgboost4j-spark2.3.2_2.11-1.3.0.jar" under the "Spark-ml-algo-lib/ml-xgboost/jvm-packages/boostkit-xgboost4j-spark/target" directory.
 
 
 Contribution Guidelines

diff --git a/ml-accelerator/pom.xml b/ml-accelerator/pom.xml
@@ -1,26 +1,29 @@
 <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
   <parent>
     <groupId>org.apache.spark</groupId>
-    <artifactId>sophon-ml</artifactId>
-    <version>1.2.0</version>
+    <artifactId>boostkit-ml</artifactId>
+    <version>1.3.0</version>
   </parent>
 
   <modelVersion>4.0.0</modelVersion>
-  <artifactId>sophon-ml-acc_2.11</artifactId>
-  <version>1.2.0</version>
+  <artifactId>boostkit-ml-acc_2.11</artifactId>
+  <version>1.3.0</version>
   <name>${project.artifactId}</name>
   <description>Spark ml algo accelerator</description>
 
   <dependencies>
     <dependency>
       <groupId>org.apache.spark</groupId>
-      <artifactId>sophon-ml-core_2.11</artifactId>
+      <artifactId>boostkit-ml-core_2.11</artifactId>
       <version>${project.version}</version>
+      <classifier>${spark.version}</classifier>
+
     </dependency>
     <dependency>
       <groupId>org.apache.spark</groupId>
-      <artifactId>sophon-ml-kernel-client_2.11</artifactId>
+      <artifactId>boostkit-ml-kernel-client_2.11</artifactId>
       <version>${project.version}</version>
+      <classifier>${spark.version}</classifier>
       <scope>compile</scope>
     </dependency>
   </dependencies>
@@ -89,6 +92,9 @@
             <goals>
               <goal>jar</goal>
             </goals>
+            <configuration>
+              <classifier>${spark.version}</classifier>
+            </configuration>
           </execution>
           <execution>
             <id>default-jar</id>

diff --git a/ml-accelerator/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala b/ml-accelerator/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala
@@ -233,14 +233,14 @@ class LinearSVC @Since("2.2.0") (
       var u = ic
       try {
         u = instances.sparkContext.getConf
-          .getDouble("spark.sophon.LinearSVC.inertiaCoefficient", ic)
+          .getDouble("spark.boostkit.LinearSVC.inertiaCoefficient", ic)
         if (u < 0.0) {
           throw new Exception
         }
       }
       catch {
         case x: Exception =>
-          throw new Exception("'spark.sophon.LinearSVC.inertiaCoefficient' value is invalid")
+          throw new Exception("'spark.boostkit.LinearSVC.inertiaCoefficient' value is invalid")
       }
       this.ic = u
 

diff --git a/ml-accelerator/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala b/ml-accelerator/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala
@@ -995,26 +995,27 @@ object ALS extends DefaultParamsReadable[ALS] with Logging {
 
     var unpersistCycle = DEFAULT_UNPERSIST_CYCLE
     try {
-      unpersistCycle = sc.getConf.getInt("spark.sophon.ALS.unpersistCycle", DEFAULT_UNPERSIST_CYCLE)
+      unpersistCycle = sc.getConf.getInt("spark.boostkit.ALS.unpersistCycle",
+        DEFAULT_UNPERSIST_CYCLE)
       if (unpersistCycle < 0) {
         throw new Exception
       }
     }
     catch {
       case x: Exception =>
-        throw new Exception("'spark.sophon.ALS.unpersistCycle' value is invalid")
+        throw new Exception("'spark.boostkit.ALS.unpersistCycle' value is invalid")
     }
 
     var blockMaxRow = DEFAULT_BLOCK_MAX_ROW
     try {
-      blockMaxRow = sc.getConf.getInt("spark.sophon.ALS.blockMaxRow", DEFAULT_BLOCK_MAX_ROW)
+      blockMaxRow = sc.getConf.getInt("spark.boostkit.ALS.blockMaxRow", DEFAULT_BLOCK_MAX_ROW)
       if (blockMaxRow < 0) {
         throw new Exception
       }
     }
     catch {
       case x: Exception =>
-        throw new Exception("'spark.sophon.ALS.blockMaxRow' value is invalid")
+        throw new Exception("'spark.boostkit.ALS.blockMaxRow' value is invalid")
     }
     if (implicitPrefs) {
       val dataIterI = new Array[RDD[(Int, ALS.FactorBlock)]](unpersistCycle)

diff --git a/ml-accelerator/src/main/scala/org/apache/spark/ml/stat/Correlation.scala b/ml-accelerator/src/main/scala/org/apache/spark/ml/stat/Correlation.scala
@@ -0,0 +1,93 @@
+// scalastyle:off header.matches
+/*
+* Copyright (C) 2021. Huawei Technologies Co., Ltd.
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+* */
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.stat
+
+import scala.collection.JavaConverters._
+
+import org.apache.spark.annotation.{Experimental, Since}
+import org.apache.spark.ml.linalg.{SQLDataTypes, Vector}
+import org.apache.spark.mllib.linalg.{Vectors => OldVectors}
+import org.apache.spark.mllib.stat.{Statistics => OldStatistics}
+import org.apache.spark.sql.{DataFrame, Dataset, Row}
+import org.apache.spark.sql.types.{StructField, StructType}
+
+/**
+ * API for correlation functions in MLlib, compatible with DataFrames and Datasets.
+ *
+ * The functions in this package generalize the functions in [[org.apache.spark.sql.Dataset#stat]]
+ * to spark.ml's Vector types.
+ */
+@Since("2.2.0")
+@Experimental
+object Correlation {
+
+  /**
+   * :: Experimental ::
+   * Compute the correlation matrix for the input Dataset of Vectors using the specified method.
+   * Methods currently supported: `pearson` (default), `spearman`.
+   *
+   * @param dataset A dataset or a dataframe
+   * @param column The name of the column of vectors for which the correlation coefficient needs
+   *               to be computed. This must be a column of the dataset, and it must contain
+   *               Vector objects.
+   * @param method String specifying the method to use for computing correlation.
+   *               Supported: `pearson` (default), `spearman`
+   * @return A dataframe that contains the correlation matrix of the column of vectors. This
+   *         dataframe contains a single row and a single column of name
+   *         '$METHODNAME($COLUMN)'.
+   * @throws IllegalArgumentException if the column is not a valid column in the dataset, or if
+   *                                  the content of this column is not of type Vector.
+   *
+   *  Here is how to access the correlation coefficient:
+   *  {{{
+   *    val data: Dataset[Vector] = ...
+   *    val Row(coeff: Matrix) = Correlation.corr(data, "value").head
+   *    // coeff now contains the Pearson correlation matrix.
+   *  }}}
+   *
+   * @note For Spearman, a rank correlation, we need to create an RDD[Double] for each column
+   * and sort it in order to retrieve the ranks and then join the columns back into an RDD[Vector],
+   * which is fairly costly. Cache the input Dataset before calling corr with `method = "spearman"`
+   * to avoid recomputing the common lineage.
+   */
+  @Since("2.2.0")
+  def corr(dataset: Dataset[_], column: String, method: String): DataFrame = {
+    val rdd = dataset.select(column).rdd.map {
+      case Row(v: Vector) => OldVectors.fromML(v)
+    }
+    val oldM = OldStatistics.corr(rdd, method)
+    val name = s"$method($column)"
+    val schema = StructType(Array(StructField(name, SQLDataTypes.MatrixType, nullable = false)))
+    dataset.sparkSession.createDataFrame(Seq(Row(oldM.asML)).asJava, schema)
+  }
+
+  /**
+   * Compute the Pearson correlation matrix for the input Dataset of Vectors.
+   */
+  @Since("2.2.0")
+  def corr(dataset: Dataset[_], column: String): DataFrame = {
+    corr(dataset, column, "pearson")
+  }
+}
diff --git a/ml-accelerator/src/main/scala/org/apache/spark/ml/tree/impl/GradientBoostedTrees.scala b/ml-accelerator/src/main/scala/org/apache/spark/ml/tree/impl/GradientBoostedTrees.scala
@@ -153,7 +153,7 @@ private[spark] object GradientBoostedTrees extends Logging {
     }
   }
 
-  private val extraParamKey = "spark.sophon.ml.gbdt.doUseAcc"
+  private val extraParamKey = "spark.boostkit.ml.gbdt.doUseAcc"
   private val doUseAccDefault = true
 
   private def getDoUseAccFromSparkConf(sc: SparkContext): Boolean = {
@@ -163,7 +163,7 @@ private[spark] object GradientBoostedTrees extends Logging {
         doUseAcctStr.get.toBoolean
       } catch {
         case ex: Exception =>
-          throw new IllegalArgumentException(s"Parse sophon parameter" +
+          throw new IllegalArgumentException(s"Parse boostkit parameter" +
             s"($extraParamKey) failed, Error reason: ${ex.getMessage}")
       }
     } else {

diff --git a/ml-accelerator/src/main/scala/org/apache/spark/mllib/clustering/KMACCm.scala b/ml-accelerator/src/main/scala/org/apache/spark/mllib/clustering/KMACCm.scala
@@ -74,15 +74,15 @@ object KMACCm {
 
     var sampleRate = DEFAULT_SAMPLE_RATE
     try {
-      sampleRate = sc.getConf.getDouble("spark.sophon.Kmeans.sampleRate",
+      sampleRate = sc.getConf.getDouble("spark.boostkit.Kmeans.sampleRate",
         DEFAULT_SAMPLE_RATE)
       if (sampleRate < 0.0) {
         throw new Exception
       }
     }
     catch {
       case x: Exception =>
-        throw new Exception("'spark.sophon.Kmeans.sampleRate' value is invalid")
+        throw new Exception("'spark.boostkit.Kmeans.sampleRate' value is invalid")
     }
 
     while (iteration < maxIterations && !converged) {

diff --git a/ml-accelerator/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala b/ml-accelerator/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
@@ -273,9 +273,9 @@ class KMeans private(
     // Execute iterations of Lloyd's algorithm until converged
     if (cl > 1) {
       val methodEnum = Array("default", "allData")
-      val method = sc.getConf.get("spark.sophon.Kmeans.optMethod", "default")
+      val method = sc.getConf.get("spark.boostkit.Kmeans.optMethod", "default")
       if (!methodEnum.contains(method)) {
-        throw new Exception("'spark.sophon.Kmeans.optMethod' value is invalid")
+        throw new Exception("'spark.boostkit.Kmeans.optMethod' value is invalid")
       }
       if (method == "allData") {
         KMACCm.compute(data, centers, maxIterations, epsilon, false)

diff --git a/ml-accelerator/src/main/scala/org/apache/spark/mllib/stat/correlation/Correlation.scala b/ml-accelerator/src/main/scala/org/apache/spark/mllib/stat/correlation/Correlation.scala
@@ -0,0 +1,103 @@
+// scalastyle:off header.matches
+/*
+* Copyright (C) 2021. Huawei Technologies Co., Ltd.
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+* */
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.stat.correlation
+
+import org.apache.spark.mllib.linalg.{DenseVector, Matrix, Vector}
+import org.apache.spark.rdd.RDD
+
+/**
+ * Trait for correlation algorithms.
+ */
+private[stat] trait Correlation {
+
+  /**
+   * Compute correlation for two datasets.
+   */
+  def computeCorrelation(x: RDD[Double], y: RDD[Double]): Double
+
+  /**
+   * Compute the correlation matrix S, for the input matrix, where S(i, j) is the correlation
+   * between column i and j. S(i, j) can be NaN if the correlation is undefined for column i and j.
+   */
+  def computeCorrelationMatrix(X: RDD[Vector]): Matrix
+
+  /**
+   * Combine the two input RDD[Double]s into an RDD[Vector] and compute the correlation using the
+   * correlation implementation for RDD[Vector]. Can be NaN if correlation is undefined for the
+   * input vectors.
+   */
+  def computeCorrelationWithMatrixImpl(x: RDD[Double], y: RDD[Double]): Double = {
+    val mat: RDD[Vector] = x.zip(y).map { case (xi, yi) => new DenseVector(Array(xi, yi)) }
+    computeCorrelationMatrix(mat)(0, 1)
+  }
+
+}
+
+/**
+ * Delegates computation to the specific correlation object based on the input method name.
+ */
+private[stat] object Correlations {
+
+  def corr(x: RDD[Double],
+       y: RDD[Double],
+       method: String = CorrelationNames.defaultCorrName): Double = {
+    val correlation = getCorrelationFromName(method)
+    correlation.computeCorrelation(x, y)
+  }
+
+  def corrMatrix(X: RDD[Vector],
+      method: String = CorrelationNames.defaultCorrName): Matrix = {
+    val correlation = getCorrelationFromName(method)
+    correlation.computeCorrelationMatrix(X)
+  }
+
+  // Match input correlation name with a known name via simple string matching.
+  def getCorrelationFromName(method: String): Correlation = {
+    try {
+      CorrelationNames.nameToObjectMap(method)
+    } catch {
+      case nse: NoSuchElementException =>
+        throw new IllegalArgumentException("Unrecognized method name. Supported correlations: "
+          + CorrelationNames.nameToObjectMap.keys.mkString(", "))
+    }
+  }
+}
+
+/**
+ * Maintains supported and default correlation names.
+ *
+ * Currently supported correlations: `pearson`, `spearman`.
+ * Current default correlation: `pearson`.
+ *
+ * After new correlation algorithms are added, please update the documentation here and in
+ * Statistics.scala for the correlation APIs.
+ */
+private[mllib] object CorrelationNames {
+
+  // Note: after new types of correlations are implemented, please update this map.
+  val nameToObjectMap = Map(("pearson", PearsonCorrelation), ("spearman", SpearmanCorrelation))
+  val defaultCorrName: String = "pearson"
+
+}