
Commit

Merge pull request #456 from nvliyuan/main-v2410-release
update the main branch for 2410 release
nvliyuan authored Oct 29, 2024
2 parents 8ae5e2b + 82c2e36 commit 836bc2e
Showing 79 changed files with 14,407 additions and 10,259 deletions.
25 changes: 8 additions & 17 deletions .github/workflows/auto-merge.yml
```diff
@@ -18,25 +18,16 @@ name: auto-merge HEAD to BASE
 on:
   pull_request_target:
     branches:
-      - branch-24.08
+      - branch-*
     types: [closed]
 
 jobs:
   auto-merge:
     if: github.event.pull_request.merged == true
-    runs-on: ubuntu-latest
-
-    steps:
-      - uses: actions/checkout@v4
-        with:
-          ref: branch-24.08 # force to fetch from latest upstream instead of PR ref
-
-      - name: auto-merge job
-        uses: ./.github/workflows/auto-merge
-        env:
-          OWNER: NVIDIA
-          REPO_NAME: spark-rapids-examples
-          HEAD: branch-24.08
-          BASE: branch-24.10
-          AUTOMERGE_TOKEN: ${{ secrets.AUTOMERGE_TOKEN }} # use to merge PR
-
+    uses: NVIDIA/spark-rapids-common/.github/workflows/auto-merge.yml@main
+    with:
+      owner: ${{ github.repository_owner }}
+      repo: spark-rapids-examples
+      branch: ${{ github.event.pull_request.base.ref }}
+    secrets:
+      token: ${{ secrets.AUTOMERGE_TOKEN }}
```
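The caller side of this change passes `owner`, `repo`, and `branch` inputs plus a `token` secret to the shared workflow. For orientation, a workflow consumed this way has to declare a matching `workflow_call` interface on its own side. A minimal sketch of what that declaration could look like — only the input and secret names are taken from the diff; the defaults and the merge step itself are assumptions, not the actual contents of NVIDIA/spark-rapids-common:

```yaml
# Hypothetical sketch of the interface a reusable auto-merge workflow
# must expose to be callable as in the diff above. Only the input and
# secret names come from the diff; everything else is assumed.
name: auto-merge (reusable)

on:
  workflow_call:
    inputs:
      owner:
        type: string
        required: true
      repo:
        type: string
        required: true
      branch:
        type: string
        required: true
    secrets:
      token:
        required: true

jobs:
  auto-merge:
    runs-on: ubuntu-latest
    steps:
      - name: merge HEAD into BASE
        env:
          GH_TOKEN: ${{ secrets.token }}
        run: |
          # Placeholder: the real workflow in NVIDIA/spark-rapids-common
          # implements the merge; this step only illustrates the shape.
          echo "auto-merge ${{ inputs.owner }}/${{ inputs.repo }}@${{ inputs.branch }}"
```

One practical effect of the migration: the branch name is now derived from `github.event.pull_request.base.ref` and the trigger matches `branch-*`, so the workflow no longer needs to be edited for each release branch.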
22 changes: 0 additions & 22 deletions .github/workflows/auto-merge/Dockerfile

This file was deleted.

20 changes: 0 additions & 20 deletions .github/workflows/auto-merge/action.yml

This file was deleted.

137 changes: 0 additions & 137 deletions .github/workflows/auto-merge/automerge

This file was deleted.

18 changes: 8 additions & 10 deletions .github/workflows/signoff-check.yml
```diff
@@ -1,4 +1,4 @@
-# Copyright (c) 2021, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -23,12 +23,10 @@ jobs:
   signoff-check:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v4
-
-      - name: sigoff-check job
-        uses: ./.github/workflows/signoff-check
-        env:
-          OWNER: NVIDIA
-          REPO_NAME: spark-rapids-examples
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          PULL_NUMBER: ${{ github.event.number }}
+      - name: signoff
+        uses: NVIDIA/spark-rapids-common/signoff-check@main
+        with:
+          owner: ${{ github.repository_owner }}
+          repo: spark-rapids-examples
+          pull_number: ${{ github.event.number }}
+          token: ${{ secrets.GITHUB_TOKEN }}
```
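Unlike the auto-merge change, this diff references the shared code as an action (`uses:` inside a step with `with:` inputs), so the shared side would be an `action.yml` rather than a reusable workflow. A hedged sketch of the interface such a composite action would need to expose — the input names come from the diff; the check itself is a placeholder, not the actual implementation in spark-rapids-common:

```yaml
# Hypothetical sketch of an action.yml for a composite signoff-check
# action accepting the inputs used in the diff above. Input names come
# from the diff; the implementation step is assumed.
name: signoff-check
description: Verify that commits in a pull request carry a Signed-off-by line
inputs:
  owner:
    required: true
  repo:
    required: true
  pull_number:
    required: true
  token:
    required: true
runs:
  using: composite
  steps:
    - shell: bash
      env:
        GH_TOKEN: ${{ inputs.token }}
      run: |
        # Placeholder: fetch the PR's commit messages and look for a
        # "Signed-off-by:" line (the real check lives in spark-rapids-common).
        gh api "repos/${{ inputs.owner }}/${{ inputs.repo }}/pulls/${{ inputs.pull_number }}/commits" \
          --jq '.[].commit.message' | grep -q "Signed-off-by:"
```

Note that the new step no longer needs `actions/checkout`, since the check works entirely through the GitHub API rather than the local worktree.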
22 changes: 0 additions & 22 deletions .github/workflows/signoff-check/Dockerfile

This file was deleted.

19 changes: 0 additions & 19 deletions .github/workflows/signoff-check/action.yml

This file was deleted.

69 changes: 0 additions & 69 deletions .github/workflows/signoff-check/signoff-check

This file was deleted.

9 changes: 5 additions & 4 deletions README.md
```diff
@@ -5,10 +5,10 @@ RAPIDS Accelerator for Apache Spark accelerates Spark applications with no code
 You can download the latest version of RAPIDS Accelerator [here](https://nvidia.github.io/spark-rapids/docs/download.html).
 This repo contains examples and applications that showcases the performance and benefits of using
 RAPIDS Accelerator in data processing and machine learning pipelines.
-There are broadly four categories of examples in this repo:
+There are broadly five categories of examples in this repo:
 1. [SQL/Dataframe](./examples/SQL+DF-Examples)
 2. [Spark XGBoost](./examples/XGBoost-Examples)
-3. [Deep Learning/Machine Learning](./examples/ML+DL-Examples)
+3. [Machine Learning/Deep Learning](./examples/ML+DL-Examples)
 4. [RAPIDS UDF](./examples/UDF-Examples)
+5. [Databricks Tools demo notebooks](./tools/databricks)
@@ -23,7 +23,8 @@ Here is the list of notebooks in this repo:
 | 3 | XGBoost | Agaricus (Scala) | Uses XGBoost classifier function to create model that can accurately differentiate between edible and poisonous mushrooms with the [agaricus dataset](https://archive.ics.uci.edu/ml/datasets/mushroom)
 | 4 | XGBoost | Mortgage (Scala) | End-to-end ETL + XGBoost example to predict mortgage default with [Fannie Mae Single-Family Loan Performance Data](https://capitalmarkets.fanniemae.com/credit-risk-transfer/single-family-credit-risk-transfer/fannie-mae-single-family-loan-performance-data)
 | 5 | XGBoost | Taxi (Scala) | End-to-end ETL + XGBoost example to predict taxi trip fare amount with [NYC taxi trips data set](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page)
-| 6 | ML/DL | PCA End-to-End | Spark MLlib based PCA example to train and transform with a synthetic dataset
+| 6 | ML/DL | PCA | [Spark-Rapids-ML](https://github.com/NVIDIA/spark-rapids-ml) based PCA example to train and transform with a synthetic dataset
+| 7 | ML/DL | DL Inference | 11 notebooks demonstrating distributed model inference on Spark using the `predict_batch_udf` across various frameworks: PyTorch, HuggingFace, and TensorFlow
 
 Here is the list of Apache Spark applications (Scala and PySpark) that
 can be built for running on GPU with RAPIDS Accelerator in this repo:
@@ -33,7 +34,7 @@ can be built for running on GPU with RAPIDS Accelerator in this repo:
 | 1 | XGBoost | Agaricus (Scala) | Uses XGBoost classifier function to create model that can accurately differentiate between edible and poisonous mushrooms with the [agaricus dataset](https://archive.ics.uci.edu/ml/datasets/mushroom)
 | 2 | XGBoost | Mortgage (Scala) | End-to-end ETL + XGBoost example to predict mortgage default with [Fannie Mae Single-Family Loan Performance Data](https://capitalmarkets.fanniemae.com/credit-risk-transfer/single-family-credit-risk-transfer/fannie-mae-single-family-loan-performance-data)
 | 3 | XGBoost | Taxi (Scala) | End-to-end ETL + XGBoost example to predict taxi trip fare amount with [NYC taxi trips data set](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page)
-| 4 | ML/DL | PCA End-to-End | Spark MLlib based PCA example to train and transform with a synthetic dataset
+| 4 | ML/DL | PCA | [Spark-Rapids-ML](https://github.com/NVIDIA/spark-rapids-ml) based PCA example to train and transform with a synthetic dataset
 | 5 | UDF | URL Decode | Decodes URL-encoded strings using the [Java APIs of RAPIDS cudf](https://docs.rapids.ai/api/cudf-java/legacy/)
 | 6 | UDF | URL Encode | URL-encodes strings using the [Java APIs of RAPIDS cudf](https://docs.rapids.ai/api/cudf-java/legacy/)
 | 7 | UDF | [CosineSimilarity](./examples/UDF-Examples/RAPIDS-accelerated-UDFs/src/main/java/com/nvidia/spark/rapids/udf/java/CosineSimilarity.java) | Computes the cosine similarity between two float vectors using [native code](./examples/UDF-Examples/RAPIDS-accelerated-UDFs/src/main/cpp/src)
```
