diff --git a/.github/workflows/auto-merge.yml b/.github/workflows/auto-merge.yml
index 233bddef3cd..9758d746c87 100644
--- a/.github/workflows/auto-merge.yml
+++ b/.github/workflows/auto-merge.yml
@@ -18,7 +18,7 @@ name: auto-merge HEAD to BASE
 on:
   pull_request_target:
     branches:
-      - branch-24.06
+      - branch-24.08
     types: [closed]
 
 jobs:
@@ -29,13 +29,13 @@ jobs:
     steps:
       - uses: actions/checkout@v4
         with:
-          ref: branch-24.06 # force to fetch from latest upstream instead of PR ref
+          ref: branch-24.08 # force to fetch from latest upstream instead of PR ref
 
       - name: auto-merge job
         uses: ./.github/workflows/auto-merge
         env:
           OWNER: NVIDIA
           REPO_NAME: spark-rapids
-          HEAD: branch-24.06
-          BASE: branch-24.08
+          HEAD: branch-24.08
+          BASE: branch-24.10
           AUTOMERGE_TOKEN: ${{ secrets.AUTOMERGE_TOKEN }} # use to merge PR
diff --git a/.github/workflows/blossom-ci.yml b/.github/workflows/blossom-ci.yml
index 447f3d5049b..93557017b08 100644
--- a/.github/workflows/blossom-ci.yml
+++ b/.github/workflows/blossom-ci.yml
@@ -74,7 +74,10 @@ jobs:
         github.actor == 'binmahone' ||
         github.actor == 'zpuller' ||
         github.actor == 'pxLi' ||
-        github.actor == 'Feng-Jiang28'
+        github.actor == 'Feng-Jiang28' ||
+        github.actor == 'SurajAralihalli' ||
+        github.actor == 'jihoonson' ||
+        github.actor == 'ustcfy'
       )
     steps:
       - name: Check if comment is issued by authorized person
diff --git a/.github/workflows/mvn-verify-check.yml b/.github/workflows/mvn-verify-check.yml
index 734dd8e6bb9..86e12a4a32b 100644
--- a/.github/workflows/mvn-verify-check.yml
+++ b/.github/workflows/mvn-verify-check.yml
@@ -40,10 +40,9 @@ jobs:
     runs-on: ubuntu-latest
     outputs:
       dailyCacheKey: ${{ steps.generateCacheKey.outputs.dailyCacheKey }}
-      defaultSparkVersion: ${{ steps.allShimVersionsStep.outputs.defaultSparkVersion }}
-      sparkTailVersions: ${{ steps.allShimVersionsStep.outputs.tailVersions }}
-      sparkJDKVersions: ${{ steps.allShimVersionsStep.outputs.jdkVersions }}
-      scala213Versions: ${{ steps.allShimVersionsStep.outputs.scala213Versions }}
+      defaultSparkVersion: ${{ steps.all212ShimVersionsStep.outputs.defaultSparkVersion }}
+      sparkTailVersions: ${{ steps.all212ShimVersionsStep.outputs.tailVersions }}
+      sparkJDKVersions: ${{ steps.all212ShimVersionsStep.outputs.jdkVersions }}
     steps:
       - uses: actions/checkout@v4 # refs/pull/:prNumber/merge
       - uses: actions/setup-java@v4
@@ -65,31 +64,12 @@
           restore-keys: ${{ runner.os }}-maven-
       - name: populate-daily-cache
         if: steps.cache.outputs.cache-hit != 'true'
+        env:
+          SCALA_VER: '2.12'
         run: |
-          set -x
-          max_retry=3; delay=30; i=1
-          while true; do
-            for pom in pom.xml scala2.13/pom.xml
-            do
-              mvn ${{ env.COMMON_MVN_FLAGS }} --file $pom help:evaluate -pl dist \
-                -Dexpression=included_buildvers \
-                -DforceStdout -PnoSnapshots -q | tr -d ',' | \
-                xargs -n 1 bash -c \
-                  'mvn ${{ env.COMMON_MVN_FLAGS }} --file $1 -Dbuildver=$2 de.qaware.maven:go-offline-maven-plugin:resolve-dependencies' _ $pom
-
-              # compile base versions to cache scala compiler and compiler bridge
-              mvn ${{ env.COMMON_MVN_FLAGS }} --file $pom \
-                process-test-resources -pl sql-plugin-api -am
-            done && break || {
-              if [[ $i -le $max_retry ]]; then
-                echo "mvn command failed. Retry $i/$max_retry."; ((i++)); sleep $delay; ((delay=delay*2))
-              else
-                echo "mvn command failed. Exit 1"; exit 1
-              fi
-            }
-          done
+          . .github/workflows/mvn-verify-check/populate-daily-cache.sh
       - name: all shim versions
-        id: allShimVersionsStep
+        id: all212ShimVersionsStep
         run: |
           set -x
           . jenkins/version-def.sh
@@ -113,30 +93,12 @@ jobs:
           jdkHeadVersionArrBody=$(printf ",{\"spark-version\":\"%s\",\"java-version\":8}" "${SPARK_BASE_SHIM_VERSION}")
           # jdk11
           jdk11VersionArrBody=$(printf ",{\"spark-version\":\"%s\",\"java-version\":11}" "${SPARK_SHIM_VERSIONS_JDK11[@]}")
-          # jdk17
-          jdk17VersionArrBody=$(printf ",{\"spark-version\":\"%s\",\"java-version\":17}" "${SPARK_SHIM_VERSIONS_JDK17[@]}")
           # jdk
-          jdkVersionArrBody=$jdkHeadVersionArrBody$jdk11VersionArrBody$jdk17VersionArrBody
+          jdkVersionArrBody=$jdkHeadVersionArrBody$jdk11VersionArrBody
           jdkVersionArrBody=${jdkVersionArrBody:1}
           jdkVersionJsonStr=$(printf {\"include\":[%s]} $jdkVersionArrBody)
           echo "jdkVersions=$jdkVersionJsonStr" >> $GITHUB_OUTPUT
 
-          SCALA_BINARY_VER=2.13
-          . jenkins/version-def.sh
-          svArrBodyNoSnapshot=$(printf ",{\"spark-version\":\"%s\",\"isSnapshot\":false}" "${SPARK_SHIM_VERSIONS_NOSNAPSHOTS[@]}")
-          svArrBodyNoSnapshot=${svArrBodyNoSnapshot:1}
-          # get private artifact version
-          privateVer=$(mvn help:evaluate -q -pl dist -Dexpression=spark-rapids-private.version -DforceStdout)
-          # do not add empty snapshot versions or when private version is released one (does not include snapshot shims)
-          if [[ ${#SPARK_SHIM_VERSIONS_SNAPSHOTS_ONLY[@]} -gt 0 && $privateVer == *"-SNAPSHOT" ]]; then
-            svArrBodySnapshot=$(printf ",{\"spark-version\":\"%s\",\"isSnapshot\":true}" "${SPARK_SHIM_VERSIONS_SNAPSHOTS_ONLY[@]}")
-            svArrBodySnapshot=${svArrBodySnapshot:1}
-            svJsonStr=$(printf {\"include\":[%s]} $svArrBodyNoSnapshot,$svArrBodySnapshot)
-          else
-            svJsonStr=$(printf {\"include\":[%s]} $svArrBodyNoSnapshot)
-          fi
-
-          echo "scala213Versions=$svJsonStr" >> $GITHUB_OUTPUT
 
   package-tests:
     needs: cache-dependencies
@@ -187,27 +149,82 @@ jobs:
           }
         done
 
+  cache-dependencies-scala213:
+    runs-on: ubuntu-latest
+    outputs:
+      scala213dailyCacheKey: ${{ steps.generateCacheKey.outputs.scala213dailyCacheKey }}
+      scala213Versions: ${{ steps.all213ShimVersionsStep.outputs.scala213Versions }}
+      sparkJDK17Versions: ${{ steps.all213ShimVersionsStep.outputs.jdkVersions }}
+    steps:
+      - uses: actions/checkout@v4 # refs/pull/:prNumber/merge
+      - uses: actions/setup-java@v4
+        with:
+          distribution: 'temurin'
+          java-version: 17
+      - name: Generate daily cache key
+        id: generateCacheKey
+        run: |
+          set -x
+          cacheKey="${{ runner.os }}-maven-scala213-${{ hashFiles('**/pom.xml') }}-${{ github.event.pull_request.base.ref }}-$(date +'%Y-%m-%d')"
+          echo "scala213dailyCacheKey=$cacheKey" | tee $GITHUB_ENV $GITHUB_OUTPUT
+      - name: Cache local Maven repository
+        id: cache
+        uses: actions/cache@v4
+        with:
+          path: ~/.m2
+          key: ${{ env.scala213dailyCacheKey }}
+          restore-keys: ${{ runner.os }}-maven-
+      - name: populate-daily-cache
+        if: steps.cache.outputs.cache-hit != 'true'
+        env:
+          SCALA_VER: '2.13'
+        run: |
+          . .github/workflows/mvn-verify-check/populate-daily-cache.sh
+      - name: all 213 shim versions
+        id: all213ShimVersionsStep
+        run: |
+          set -x
+          SCALA_BINARY_VER=2.13
+          . jenkins/version-def.sh
+          svArrBodyNoSnapshot=$(printf ",{\"spark-version\":\"%s\",\"isSnapshot\":false}" "${SPARK_SHIM_VERSIONS_NOSNAPSHOTS[@]}")
+          svArrBodyNoSnapshot=${svArrBodyNoSnapshot:1}
+          # get private artifact version
+          privateVer=$(mvn help:evaluate -q -pl dist -Dexpression=spark-rapids-private.version -DforceStdout)
+          svJsonStr=$(printf {\"include\":[%s]} $svArrBodyNoSnapshot)
+
+          echo "scala213Versions=$svJsonStr" >> $GITHUB_OUTPUT
+
+          # jdk17
+          jdk17VersionArrBody=$(printf ",{\"spark-version\":\"%s\",\"java-version\":17}" "${SPARK_SHIM_VERSIONS_JDK17_SCALA213[@]}")
+
+          jdkVersionArrBody=$jdk17VersionArrBody
+          jdkVersionArrBody=${jdkVersionArrBody:1}
+          jdkVersionJsonStr=$(printf {\"include\":[%s]} $jdkVersionArrBody)
+          echo "jdkVersions=$jdkVersionJsonStr" >> $GITHUB_OUTPUT
+
   package-tests-scala213:
-    needs: cache-dependencies
+    needs: cache-dependencies-scala213
     continue-on-error: ${{ matrix.isSnapshot }}
     strategy:
-      matrix: ${{ fromJSON(needs.cache-dependencies.outputs.scala213Versions) }}
+      matrix: ${{ fromJSON(needs.cache-dependencies-scala213.outputs.scala213Versions) }}
       fail-fast: false
     runs-on: ubuntu-latest
     steps:
+      - uses: actions/checkout@v4 # refs/pull/:prNumber/merge
       - name: Setup Java and Maven Env
         uses: actions/setup-java@v4
         with:
           distribution: adopt
-          java-version: 8
+          java-version: 17
       - name: Cache local Maven repository
         uses: actions/cache@v4
         with:
           path: ~/.m2
-          key: ${{ needs.cache-dependencies.outputs.dailyCacheKey }}
+          key: ${{ needs.cache-dependencies-scala213.outputs.scala213dailyCacheKey }}
 
       - name: check runtime before tests
         run: |
@@ -218,7 +235,7 @@ jobs:
         run: |
           # https://github.com/NVIDIA/spark-rapids/issues/8847
           # specify expected versions
-          export JAVA_HOME=${JAVA_HOME_8_X64}
+          export JAVA_HOME=${JAVA_HOME_17_X64}
           export PATH=${JAVA_HOME}/bin:${PATH}
           java -version && mvn --version && echo "ENV JAVA_HOME: $JAVA_HOME, PATH: $PATH"
           # verify Scala 2.13 build files
@@ -246,8 +263,63 @@ jobs:
           }
         done
 
+  verify-213-modules:
+    needs: cache-dependencies-scala213
+    runs-on: ubuntu-latest
+    strategy:
+      matrix: ${{ fromJSON(needs.cache-dependencies-scala213.outputs.sparkJDK17Versions) }}
+    steps:
+      - uses: actions/checkout@v4 # refs/pull/:prNumber/merge
+
+      - name: Setup Java and Maven Env
+        uses: actions/setup-java@v4
+        with:
+          distribution: adopt
+          java-version: 17
+
+      - name: Cache local Maven repository
+        uses: actions/cache@v4
+        with:
+          path: ~/.m2
+          key: ${{ needs.cache-dependencies-scala213.outputs.scala213dailyCacheKey }}
+
+      - name: check runtime before tests
+        run: |
+          env | grep JAVA
+          java -version && mvn --version && echo "ENV JAVA_HOME: $JAVA_HOME, PATH: $PATH"
+
+      - name: Build JDK
+        run: |
+          # https://github.com/NVIDIA/spark-rapids/issues/8847
+          # specify expected versions
+          export JAVA_HOME=${JAVA_HOME_${{ matrix.java-version }}_X64}
+          export PATH=${JAVA_HOME}/bin:${PATH}
+          java -version && mvn --version && echo "ENV JAVA_HOME: $JAVA_HOME, PATH: $PATH"
+          # verify Scala 2.13 build files
+          ./build/make-scala-version-build-files.sh 2.13
+          # verify git status
+          if [ -n "$(echo -n $(git status -s | grep 'scala2.13'))" ]; then
+            git add -N scala2.13/* && git diff 'scala2.13/*'
+            echo "Generated Scala 2.13 build files don't match what's in repository"
+            exit 1
+          fi
+          # change to Scala 2.13 Directory
+          cd scala2.13
+          # test command, will retry up to 3 times if it fails
+          max_retry=3; delay=30; i=1
+          while true; do
+            mvn verify \
+              -P "individual,pre-merge,source-javadoc" -Dbuildver=${{ matrix.spark-version }} \
+              ${{ env.COMMON_MVN_FLAGS }} && break || {
+              if [[ $i -le $max_retry ]]; then
+                echo "mvn command failed. Retry $i/$max_retry."; ((i++)); sleep $delay; ((delay=delay*2))
+              else
+                echo "mvn command failed. Exit 1"; exit 1
+              fi
+            }
+          done
 
-  verify-all-modules:
+  verify-all-212-modules:
     needs: cache-dependencies
     runs-on: ubuntu-latest
     strategy:
diff --git a/.github/workflows/mvn-verify-check/populate-daily-cache.sh b/.github/workflows/mvn-verify-check/populate-daily-cache.sh
new file mode 100755
index 00000000000..b93cd0b6b49
--- /dev/null
+++ b/.github/workflows/mvn-verify-check/populate-daily-cache.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+
+# Copyright (c) 2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -x
+max_retry=3; delay=30; i=1
+if [[ $SCALA_VER == '2.12' ]]; then
+  pom='pom.xml'
+elif [[ $SCALA_VER == '2.13' ]]; then
+  pom='scala2.13/pom.xml'
+fi
+while true; do
+  {
+    python build/get_buildvers.py "no_snapshots.buildvers" $pom | tr -d ',' | \
+      xargs -n 1 -I {} bash -c \
+        "mvn $COMMON_MVN_FLAGS --file $pom -Dbuildver={} de.qaware.maven:go-offline-maven-plugin:resolve-dependencies"
+
+    # compile base versions to cache scala compiler and compiler bridge
+    mvn $COMMON_MVN_FLAGS --file $pom \
+      process-test-resources -pl sql-plugin-api -am
+  } && break || {
+    if [[ $i -le $max_retry ]]; then
+      echo "mvn command failed. Retry $i/$max_retry."; ((i++)); sleep $delay; ((delay=delay*2))
+    else
+      echo "mvn command failed. Exit 1"; exit 1
+    fi
+}
+done
\ No newline at end of file
diff --git a/.github/workflows/signoff-check.yml b/.github/workflows/signoff-check.yml
index 076f72c7f28..8ae20f2c295 100644
--- a/.github/workflows/signoff-check.yml
+++ b/.github/workflows/signoff-check.yml
@@ -23,12 +23,10 @@ jobs:
   signoff-check:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v4
-
-      - name: sigoff-check job
-        uses: ./.github/workflows/signoff-check
-        env:
-          OWNER: NVIDIA
-          REPO_NAME: spark-rapids
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          PULL_NUMBER: ${{ github.event.number }}
+      - name: signoff
+        uses: NVIDIA/spark-rapids-common/signoff-check@main
+        with:
+          owner: ${{ github.repository_owner }}
+          repo: spark-rapids
+          pull_number: ${{ github.event.number }}
+          token: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/signoff-check/Dockerfile b/.github/workflows/signoff-check/Dockerfile
deleted file mode 100644
index 84c1171dba3..00000000000
--- a/.github/workflows/signoff-check/Dockerfile
+++ /dev/null
@@ -1,22 +0,0 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-FROM python:3.8-slim-buster
-
-WORKDIR /
-COPY signoff-check .
-RUN pip install PyGithub && chmod +x /signoff-check
-
-# require envs: OWNER,REPO_NAME,GITHUB_TOKEN,PULL_NUMBER
-ENTRYPOINT ["/signoff-check"]
diff --git a/.github/workflows/signoff-check/action.yml b/.github/workflows/signoff-check/action.yml
deleted file mode 100644
index d5f58f1d2a7..00000000000
--- a/.github/workflows/signoff-check/action.yml
+++ /dev/null
@@ -1,19 +0,0 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-name: 'signoff check action'
-description: 'check if PR got signed off'
-runs:
-  using: 'docker'
-  image: 'Dockerfile'
diff --git a/.github/workflows/signoff-check/signoff-check b/.github/workflows/signoff-check/signoff-check
deleted file mode 100755
index 76cbf17988d..00000000000
--- a/.github/workflows/signoff-check/signoff-check
+++ /dev/null
@@ -1,71 +0,0 @@
-#!/usr/bin/env python
-
-# Copyright (c) 2020, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""A signoff check
-
-The tool checks if any commit got signoff in a pull request.
-
-NOTE: this script is for github actions only, you should not use it anywhere else.
-""" -import os -import re -import sys -from argparse import ArgumentParser - -from github import Github - -SIGNOFF_REGEX = re.compile('Signed-off-by:') - - -def signoff(token: str, owner: str, repo_name: str, pull_number: int): - gh = Github(token, per_page=100, user_agent='signoff-check', verify=True) - pr = gh.get_repo(f"{owner}/{repo_name}").get_pull(pull_number) - for c in pr.get_commits(): - if SIGNOFF_REGEX.search(c.commit.message): - print('Found signoff.\n') - print(f"Commit sha:\n{c.commit.sha}") - print(f"Commit message:\n{c.commit.message}") - return True - return False - - -def main(token: str, owner: str, repo_name: str, pull_number: int): - try: - if not signoff(token, owner, repo_name, pull_number): - raise Exception('No commits w/ signoff') - except Exception as e: # pylint: disable=broad-except - print(e) - sys.exit(1) - - -if __name__ == '__main__': - parser = ArgumentParser(description="signoff check") - parser.add_argument("--owner", help="repo owner", default='') - parser.add_argument("--repo_name", help="repo name", default='') - parser.add_argument("--token", help="github token, will use GITHUB_TOKEN if empty", default='') - parser.add_argument("--pull_number", help="pull request number", type=int) - args = parser.parse_args() - - GITHUB_TOKEN = args.token if args.token else os.environ.get('GITHUB_TOKEN') - assert GITHUB_TOKEN, 'env GITHUB_TOKEN should not be empty' - OWNER = args.owner if args.owner else os.environ.get('OWNER') - assert OWNER, 'env OWNER should not be empty' - REPO_NAME = args.repo_name if args.repo_name else os.environ.get('REPO_NAME') - assert REPO_NAME, 'env REPO_NAME should not be empty' - PULL_NUMBER = args.pull_number if args.pull_number else int(os.environ.get('PULL_NUMBER')) - assert PULL_NUMBER, 'env PULL_NUMBER should not be empty' - - main(token=GITHUB_TOKEN, owner=OWNER, repo_name=REPO_NAME, pull_number=PULL_NUMBER) diff --git a/.gitignore b/.gitignore index 867c8af50aa..ed8a60ec20a 100644 --- a/.gitignore +++ b/.gitignore @@ -34,4 +34,5 @@ scalastyle-on-compile.generated.xml scalastyle-output.xml scalastyle.txt target/ -cufile.log \ No newline at end of file +cufile.log +build/*.class diff --git a/CHANGELOG.md b/CHANGELOG.md index 788fed0a98f..02e43a88303 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,209 @@ # Change log -Generated on 2024-06-13 +Generated on 2024-08-18 + +## Release 24.08 + +### Features +||| +|:---|:---| +|[#9259](https://github.com/NVIDIA/spark-rapids/issues/9259)|[FEA] Create Spark 4.0.0 shim and build env| +|[#10366](https://github.com/NVIDIA/spark-rapids/issues/10366)|[FEA] It would be nice if we could support Hive-style write bucketing table| +|[#10987](https://github.com/NVIDIA/spark-rapids/issues/10987)|[FEA] Implement lore framework to support all operators.| +|[#11087](https://github.com/NVIDIA/spark-rapids/issues/11087)|[FEA] Support regex pattern with brackets when rewrite to PrefixRange patten in rlike| +|[#22](https://github.com/NVIDIA/spark-rapids/issues/22)|[FEA] Add support for bucketed writes| +|[#9939](https://github.com/NVIDIA/spark-rapids/issues/9939)|[FEA] `GpuInsertIntoHiveTable` supports parquet format| + +### Performance +||| +|:---|:---| +|[#8750](https://github.com/NVIDIA/spark-rapids/issues/8750)|[FEA] Rework GpuSubstringIndex to use cudf::slice_strings| +|[#7404](https://github.com/NVIDIA/spark-rapids/issues/7404)|[FEA] explore a hash agg passthrough on partial aggregates| +|[#10976](https://github.com/NVIDIA/spark-rapids/issues/10976)|Rewrite `pattern1|pattern2|pattern3` to 
multiple contains in `rlike`| + +### Bugs Fixed +||| +|:---|:---| +|[#11287](https://github.com/NVIDIA/spark-rapids/issues/11287)|[BUG] String split APIs on empty string produce incorrect result| +|[#11270](https://github.com/NVIDIA/spark-rapids/issues/11270)|[BUG] test_regexp_replace[DATAGEN_SEED=1722297411, TZ=UTC] hanging there forever in pre-merge CI intermittently| +|[#9682](https://github.com/NVIDIA/spark-rapids/issues/9682)|[BUG] Casting FLOAT64 to DECIMAL(12,7) produces different rows from Apache Spark CPU| +|[#10809](https://github.com/NVIDIA/spark-rapids/issues/10809)|[BUG] cast(9.95 as decimal(3,1)), actual: 9.9, expected: 10.0| +|[#11266](https://github.com/NVIDIA/spark-rapids/issues/11266)|[BUG] test_broadcast_hash_join_constant_keys failed in databricks runtimes| +|[#11243](https://github.com/NVIDIA/spark-rapids/issues/11243)|[BUG] ArrayIndexOutOfBoundsException on a left outer join| +|[#11030](https://github.com/NVIDIA/spark-rapids/issues/11030)|Fix tests failures in string_test.py| +|[#11245](https://github.com/NVIDIA/spark-rapids/issues/11245)|[BUG] mvn verify for the source-javadoc fails and no pre-merge check catches it| +|[#11223](https://github.com/NVIDIA/spark-rapids/issues/11223)|[BUG] Remove unreferenced `CUDF_VER=xxx` in the CI script| +|[#11114](https://github.com/NVIDIA/spark-rapids/issues/11114)|[BUG] Update nightly tests for Scala 2.13 to use JDK 17 only| +|[#11229](https://github.com/NVIDIA/spark-rapids/issues/11229)|[BUG] test_delta_name_column_mapping_no_field_ids fails on Spark | +|[#11031](https://github.com/NVIDIA/spark-rapids/issues/11031)|Fix tests failures in multiple files | +|[#10948](https://github.com/NVIDIA/spark-rapids/issues/10948)|Figure out why `MapFromArrays ` appears in the tests for hive parquet write| +|[#11018](https://github.com/NVIDIA/spark-rapids/issues/11018)|Fix tests failures in hash_aggregate_test.py| +|[#11173](https://github.com/NVIDIA/spark-rapids/issues/11173)|[BUG] The `rs. 
serialization time` metric is misleading| +|[#11017](https://github.com/NVIDIA/spark-rapids/issues/11017)|Fix tests failures in url_test.py| +|[#11201](https://github.com/NVIDIA/spark-rapids/issues/11201)|[BUG] Delta Lake tables with name mapping can throw exceptions on read| +|[#11175](https://github.com/NVIDIA/spark-rapids/issues/11175)|[BUG] Clean up unused and duplicated 'org/roaringbitmap' folder in the spark3xx shims| +|[#11196](https://github.com/NVIDIA/spark-rapids/issues/11196)|[BUG] pipeline failed due to class not found exception: NoClassDefFoundError: com/nvidia/spark/rapids/GpuScalar| +|[#11189](https://github.com/NVIDIA/spark-rapids/issues/11189)|[BUG] regression in NDS after PR #11170| +|[#11167](https://github.com/NVIDIA/spark-rapids/issues/11167)|[BUG] UnsupportedOperationException during delta write with `optimize()`| +|[#11172](https://github.com/NVIDIA/spark-rapids/issues/11172)|[BUG] `get_json_object` returns wrong output with wildcard path| +|[#11148](https://github.com/NVIDIA/spark-rapids/issues/11148)|[BUG] Integration test `test_write_hive_bucketed_table` fails| +|[#11155](https://github.com/NVIDIA/spark-rapids/issues/11155)|[BUG] ArrayIndexOutOfBoundsException in BatchWithPartitionData.splitColumnarBatch| +|[#11152](https://github.com/NVIDIA/spark-rapids/issues/11152)|[BUG] LORE dumping consumes too much memory.| +|[#11029](https://github.com/NVIDIA/spark-rapids/issues/11029)|Fix tests failures in subquery_test.py| +|[#11150](https://github.com/NVIDIA/spark-rapids/issues/11150)|[BUG] hive_parquet_write_test.py::test_insert_hive_bucketed_table failure| +|[#11070](https://github.com/NVIDIA/spark-rapids/issues/11070)|[BUG] numpy2 fail fastparquet cases: numpy.dtype size changed| +|[#11136](https://github.com/NVIDIA/spark-rapids/issues/11136)|UnaryPositive expression doesn't extend UnaryExpression| +|[#11122](https://github.com/NVIDIA/spark-rapids/issues/11122)|[BUG] UT MetricRange failed 651070526 was not less than 1.5E8 in spark313| +|[#11119](https://github.com/NVIDIA/spark-rapids/issues/11119)|[BUG] window_function_test.py::test_window_group_limits_fallback_for_row_number fails in a distributed environment| +|[#11023](https://github.com/NVIDIA/spark-rapids/issues/11023)|Fix tests failures in dpp_test.py| +|[#11026](https://github.com/NVIDIA/spark-rapids/issues/11026)|Fix tests failures in map_test.py| +|[#11020](https://github.com/NVIDIA/spark-rapids/issues/11020)|Fix tests failures in grouping_sets_test.py| +|[#11113](https://github.com/NVIDIA/spark-rapids/issues/11113)|[BUG] Update premerge tests for Scala 2.13 to use JDK 17 only| +|[#11027](https://github.com/NVIDIA/spark-rapids/issues/11027)|Fix tests failures in sort_test.py| +|[#10775](https://github.com/NVIDIA/spark-rapids/issues/10775)|[BUG] Issues found by Spark UT Framework on RapidsStringExpressionsSuite| +|[#11033](https://github.com/NVIDIA/spark-rapids/issues/11033)|[BUG] CICD failed a case: cmp_test.py::test_empty_filter[>]| +|[#11103](https://github.com/NVIDIA/spark-rapids/issues/11103)|[BUG] UCX Shuffle With scala.MatchError | +|[#11007](https://github.com/NVIDIA/spark-rapids/issues/11007)|Fix tests failures in array_test.py| +|[#10801](https://github.com/NVIDIA/spark-rapids/issues/10801)|[BUG] JDK17 nightly build after Spark UT Framework is merged| +|[#11019](https://github.com/NVIDIA/spark-rapids/issues/11019)|Fix tests failures in window_function_test.py| +|[#11063](https://github.com/NVIDIA/spark-rapids/issues/11063)|[BUG] op time for GpuCoalesceBatches is more than actual| 
+|[#11006](https://github.com/NVIDIA/spark-rapids/issues/11006)|Fix test failures in arithmetic_ops_test.py| +|[#10995](https://github.com/NVIDIA/spark-rapids/issues/10995)|Fallback TimeZoneAwareExpression that only support UTC with zoneId instead of timeZone config| +|[#8652](https://github.com/NVIDIA/spark-rapids/issues/8652)|[BUG] array_item test failures on Spark 3.3.x| +|[#11053](https://github.com/NVIDIA/spark-rapids/issues/11053)|[BUG] Build on Databricks 330 fails| +|[#10925](https://github.com/NVIDIA/spark-rapids/issues/10925)| Concat cannot accept no parameter| +|[#10975](https://github.com/NVIDIA/spark-rapids/issues/10975)|[BUG] regex `^.*literal` cannot be rewritten as `contains(literal)` for multiline strings| +|[#10956](https://github.com/NVIDIA/spark-rapids/issues/10956)|[BUG] hive_parquet_write_test.py: test_write_compressed_parquet_into_hive_table integration test failures| +|[#10772](https://github.com/NVIDIA/spark-rapids/issues/10772)|[BUG] Issues found by Spark UT Framework on RapidsDataFrameAggregateSuite| +|[#10986](https://github.com/NVIDIA/spark-rapids/issues/10986)|[BUG]Cast from string to float using hand-picked values failed in CastOpSuite| +|[#10972](https://github.com/NVIDIA/spark-rapids/issues/10972)|Spark 4.0 compile errors | +|[#10794](https://github.com/NVIDIA/spark-rapids/issues/10794)|[BUG] Incorrect cast of string columns containing various infinity notations with trailing spaces | +|[#10964](https://github.com/NVIDIA/spark-rapids/issues/10964)|[BUG] Improve stability of pre-merge jenkinsfile| +|[#10714](https://github.com/NVIDIA/spark-rapids/issues/10714)|Signature changed for `PythonUDFRunner.writeUDFs` | +|[#10712](https://github.com/NVIDIA/spark-rapids/issues/10712)|[AUDIT] BatchScanExec/DataSourceV2Relation to group splits by join keys if they differ from partition keys| +|[#10673](https://github.com/NVIDIA/spark-rapids/issues/10673)|[AUDIT] Rename plan nodes for PythonMapInArrowExec| +|[#10710](https://github.com/NVIDIA/spark-rapids/issues/10710)|[AUDIT] `uncacheTableOrView` changed in CommandUtils | +|[#10711](https://github.com/NVIDIA/spark-rapids/issues/10711)|[AUDIT] Match DataSourceV2ScanExecBase changes to groupPartitions method | +|[#10669](https://github.com/NVIDIA/spark-rapids/issues/10669)|Supporting broadcast of multiple filtering keys in DynamicPruning | + +### PRs +||| +|:---|:---| +|[#11353](https://github.com/NVIDIA/spark-rapids/pull/11353)|Update download doc for v24.08.1 [skip ci]| +|[#11352](https://github.com/NVIDIA/spark-rapids/pull/11352)|Update version to 24.08.1-SNAPSHOT [skip ci]| +|[#11335](https://github.com/NVIDIA/spark-rapids/pull/11335)|Fix Delta Lake truncation of min/max string values| +|[#11304](https://github.com/NVIDIA/spark-rapids/pull/11304)|Update changelog for v24.08.0 release [skip ci]| +|[#11303](https://github.com/NVIDIA/spark-rapids/pull/11303)|Update rapids JNI and private dependency to 24.08.0| +|[#11296](https://github.com/NVIDIA/spark-rapids/pull/11296)|[DOC] update doc for 2408 release [skip CI]| +|[#11309](https://github.com/NVIDIA/spark-rapids/pull/11309)|[Doc ]Update lore doc about the range [skip ci]| +|[#11292](https://github.com/NVIDIA/spark-rapids/pull/11292)|Add work around for string split with empty input.| +|[#11278](https://github.com/NVIDIA/spark-rapids/pull/11278)|Fix formatting of advanced configs doc| +|[#10917](https://github.com/NVIDIA/spark-rapids/pull/10917)|Adopt changes from JNI for casting from float to decimal| +|[#11269](https://github.com/NVIDIA/spark-rapids/pull/11269)|Revert 
"upgrade ucx to 1.17.0"| +|[#11260](https://github.com/NVIDIA/spark-rapids/pull/11260)|Mitigate intermittent test_buckets and shuffle_smoke_test OOM issue| +|[#11268](https://github.com/NVIDIA/spark-rapids/pull/11268)|Fix degenerate conditional nested loop join detection| +|[#11244](https://github.com/NVIDIA/spark-rapids/pull/11244)|Fix ArrayIndexOutOfBoundsException on join counts with constant join keys| +|[#11259](https://github.com/NVIDIA/spark-rapids/pull/11259)|CI Docker to support integration tests with Rocky OS + jdk17 [skip ci]| +|[#11247](https://github.com/NVIDIA/spark-rapids/pull/11247)|Fix `string_test.py` errors on Spark 4.0| +|[#11246](https://github.com/NVIDIA/spark-rapids/pull/11246)|Rework Maven Source Plugin Skip| +|[#11149](https://github.com/NVIDIA/spark-rapids/pull/11149)|Rework on substring index| +|[#11236](https://github.com/NVIDIA/spark-rapids/pull/11236)|Remove the unused vars from the version-def CI script| +|[#11237](https://github.com/NVIDIA/spark-rapids/pull/11237)|Fork jvm for maven-source-plugin| +|[#11200](https://github.com/NVIDIA/spark-rapids/pull/11200)|Multi-get_json_object| +|[#11230](https://github.com/NVIDIA/spark-rapids/pull/11230)|Skip test where Delta Lake may not be fully compatible with Spark| +|[#11220](https://github.com/NVIDIA/spark-rapids/pull/11220)|Avoid failing spark bug SPARK-44242 while generate run_dir| +|[#11226](https://github.com/NVIDIA/spark-rapids/pull/11226)|Fix auto merge conflict 11212| +|[#11129](https://github.com/NVIDIA/spark-rapids/pull/11129)|Spark 4: Fix miscellaneous tests including logic, repart, hive_delimited.| +|[#11163](https://github.com/NVIDIA/spark-rapids/pull/11163)|Support `MapFromArrays` on GPU| +|[#11219](https://github.com/NVIDIA/spark-rapids/pull/11219)|Fix hash_aggregate_test.py to run with ANSI enabled| +|[#11186](https://github.com/NVIDIA/spark-rapids/pull/11186)|from_json Json to Struct Exception Logging| +|[#11180](https://github.com/NVIDIA/spark-rapids/pull/11180)|More accurate estimation for the result serialization time in RapidsShuffleThreadedWriterBase| +|[#11194](https://github.com/NVIDIA/spark-rapids/pull/11194)|Fix ANSI mode test failures in url_test.py| +|[#11202](https://github.com/NVIDIA/spark-rapids/pull/11202)|Fix read from Delta Lake table with name column mapping and missing Parquet IDs| +|[#11185](https://github.com/NVIDIA/spark-rapids/pull/11185)|Fix multi-release jar problem| +|[#11144](https://github.com/NVIDIA/spark-rapids/pull/11144)|Build the Scala2.13 dist jar with JDK17| +|[#11197](https://github.com/NVIDIA/spark-rapids/pull/11197)|Fix class not found error: com/nvidia/spark/rapids/GpuScalar| +|[#11191](https://github.com/NVIDIA/spark-rapids/pull/11191)|Fix dynamic pruning regression in GpuFileSourceScanExec| +|[#10994](https://github.com/NVIDIA/spark-rapids/pull/10994)|Add Spark 4.0.0 Build Profile and Other Supporting Changes| +|[#11192](https://github.com/NVIDIA/spark-rapids/pull/11192)|Append new authorized user to blossom-ci whitelist [skip ci]| +|[#11179](https://github.com/NVIDIA/spark-rapids/pull/11179)|Allow more expressions to be tiered| +|[#11141](https://github.com/NVIDIA/spark-rapids/pull/11141)|Enable some Rapids config in RapidsSQLTestsBaseTrait for Spark UT| +|[#11170](https://github.com/NVIDIA/spark-rapids/pull/11170)|Avoid listFiles or inputFiles on relations with static partitioning| +|[#11159](https://github.com/NVIDIA/spark-rapids/pull/11159)|Drop spark31x shims| +|[#10951](https://github.com/NVIDIA/spark-rapids/pull/10951)|Case when performance 
improvement: reduce the `copy_if_else`| +|[#11165](https://github.com/NVIDIA/spark-rapids/pull/11165)|Fix some GpuBroadcastToRowExec by not dropping columns| +|[#11126](https://github.com/NVIDIA/spark-rapids/pull/11126)|Coalesce batches after a logical coalesce operation| +|[#11164](https://github.com/NVIDIA/spark-rapids/pull/11164)|fix the bucketed write error for non-utc cases| +|[#11132](https://github.com/NVIDIA/spark-rapids/pull/11132)|Add deletion vector metrics for low shuffle merge.| +|[#11156](https://github.com/NVIDIA/spark-rapids/pull/11156)|Fix batch splitting for partition column size on row-count-only batches| +|[#11153](https://github.com/NVIDIA/spark-rapids/pull/11153)|Fix LORE dump oom.| +|[#11102](https://github.com/NVIDIA/spark-rapids/pull/11102)|Fix ANSI mode failures in subquery_test.py| +|[#11151](https://github.com/NVIDIA/spark-rapids/pull/11151)|Fix the test error of the bucketed write for the non-utc case| +|[#11147](https://github.com/NVIDIA/spark-rapids/pull/11147)|upgrade ucx to 1.17.0| +|[#11138](https://github.com/NVIDIA/spark-rapids/pull/11138)|Update fastparquet to 2024.5.0 for numpy2 compatibility| +|[#11137](https://github.com/NVIDIA/spark-rapids/pull/11137)|Handle the change for UnaryPositive now extending RuntimeReplaceable| +|[#11094](https://github.com/NVIDIA/spark-rapids/pull/11094)|Add `HiveHash` support on GPU| +|[#11139](https://github.com/NVIDIA/spark-rapids/pull/11139)|Improve MetricsSuite to allow more gc jitter| +|[#11133](https://github.com/NVIDIA/spark-rapids/pull/11133)|Fix `test_window_group_limits_fallback`| +|[#11097](https://github.com/NVIDIA/spark-rapids/pull/11097)|Fix miscellaneous integ tests for Spark 4| +|[#11118](https://github.com/NVIDIA/spark-rapids/pull/11118)|Fix issue with DPP and AQE on reused broadcast exchanges| +|[#11043](https://github.com/NVIDIA/spark-rapids/pull/11043)|Dataproc serverless test fixes| +|[#10965](https://github.com/NVIDIA/spark-rapids/pull/10965)|Profiler: Disable collecting async allocation events by default| +|[#11117](https://github.com/NVIDIA/spark-rapids/pull/11117)|Update Scala2.13 premerge CI against JDK17| +|[#11084](https://github.com/NVIDIA/spark-rapids/pull/11084)|Introduce LORE framework.| +|[#11099](https://github.com/NVIDIA/spark-rapids/pull/11099)|Spark 4: Handle ANSI mode in sort_test.py| +|[#11115](https://github.com/NVIDIA/spark-rapids/pull/11115)|Fix match error in RapidsShuffleIterator.scala [scala2.13]| +|[#11088](https://github.com/NVIDIA/spark-rapids/pull/11088)|Support regex patterns with brackets when rewriting to PrefixRange pattern in rlike.| +|[#10950](https://github.com/NVIDIA/spark-rapids/pull/10950)|Add a heuristic to skip second or third agg pass| +|[#11048](https://github.com/NVIDIA/spark-rapids/pull/11048)|Fixed array_tests for Spark 4.0.0| +|[#11049](https://github.com/NVIDIA/spark-rapids/pull/11049)|Fix some cast_tests for Spark 4.0.0| +|[#11066](https://github.com/NVIDIA/spark-rapids/pull/11066)|Replaced spark3xx-common references to spark-shared| +|[#11083](https://github.com/NVIDIA/spark-rapids/pull/11083)|Exclude a case based on JDK version in Spark UT| +|[#10997](https://github.com/NVIDIA/spark-rapids/pull/10997)|Fix some test issues in Spark UT and keep RapidsTestSettings update-to-date| +|[#11073](https://github.com/NVIDIA/spark-rapids/pull/11073)|Disable ANSI mode for window function tests| +|[#11076](https://github.com/NVIDIA/spark-rapids/pull/11076)|Improve the diagnostics for 'conv' fallback explain| 
+|[#11092](https://github.com/NVIDIA/spark-rapids/pull/11092)|Add GpuBucketingUtils shim to Spark 4.0.0| +|[#11062](https://github.com/NVIDIA/spark-rapids/pull/11062)|fix duplicate counted metrics like op time for GpuCoalesceBatches| +|[#11044](https://github.com/NVIDIA/spark-rapids/pull/11044)|Fixed Failing tests in arithmetic_ops_tests for Spark 4.0.0| +|[#11086](https://github.com/NVIDIA/spark-rapids/pull/11086)|upgrade blossom-ci actions version [skip ci]| +|[#10957](https://github.com/NVIDIA/spark-rapids/pull/10957)|Support bucketing write for GPU| +|[#10979](https://github.com/NVIDIA/spark-rapids/pull/10979)|[FEA] Introduce low shuffle merge.| +|[#10996](https://github.com/NVIDIA/spark-rapids/pull/10996)|Fallback non-UTC TimeZoneAwareExpression with zoneId| +|[#11072](https://github.com/NVIDIA/spark-rapids/pull/11072)|Workaround numpy2 failed fastparquet compatibility tests| +|[#11046](https://github.com/NVIDIA/spark-rapids/pull/11046)|Calculate parallelism to speed up pre-merge CI| +|[#11054](https://github.com/NVIDIA/spark-rapids/pull/11054)|fix flaky array_item test failures| +|[#11051](https://github.com/NVIDIA/spark-rapids/pull/11051)|[FEA] Increase parallelism of deltalake test on databricks| +|[#10993](https://github.com/NVIDIA/spark-rapids/pull/10993)|`binary-dedupe` changes for Spark 4.0.0| +|[#11060](https://github.com/NVIDIA/spark-rapids/pull/11060)|Add in the ability to fingerprint JSON columns| +|[#11059](https://github.com/NVIDIA/spark-rapids/pull/11059)|Revert "Add in the ability to fingerprint JSON columns (#11002)" [skip ci]| +|[#11039](https://github.com/NVIDIA/spark-rapids/pull/11039)|Concat() Exception bug fix| +|[#11002](https://github.com/NVIDIA/spark-rapids/pull/11002)|Add in the ability to fingerprint JSON columns| +|[#10977](https://github.com/NVIDIA/spark-rapids/pull/10977)|Rewrite multiple literal choice regex to multiple contains in rlike| +|[#11035](https://github.com/NVIDIA/spark-rapids/pull/11035)|Fix auto merge conflict 11034 [skip ci]| +|[#11040](https://github.com/NVIDIA/spark-rapids/pull/11040)|Append new authorized user to blossom-ci whitelist [skip ci]| +|[#11036](https://github.com/NVIDIA/spark-rapids/pull/11036)|Update blossom-ci ACL to secure format [skip ci]| +|[#11032](https://github.com/NVIDIA/spark-rapids/pull/11032)|Fix a hive write test failure for Spark 350| +|[#10998](https://github.com/NVIDIA/spark-rapids/pull/10998)|Improve log to print more lines in build [skip ci]| +|[#10992](https://github.com/NVIDIA/spark-rapids/pull/10992)|Addressing the Named Parameter change in Spark 4.0.0| +|[#10943](https://github.com/NVIDIA/spark-rapids/pull/10943)|Fix Spark UT issues in RapidsDataFrameAggregateSuite| +|[#10963](https://github.com/NVIDIA/spark-rapids/pull/10963)|Add rapids configs to enable GPU running in Spark UT| +|[#10978](https://github.com/NVIDIA/spark-rapids/pull/10978)|More compilation fixes for Spark 4.0.0| +|[#10953](https://github.com/NVIDIA/spark-rapids/pull/10953)|Speed up the integration tests by running them in parallel on the Databricks cluster| +|[#10958](https://github.com/NVIDIA/spark-rapids/pull/10958)|Fix a hive write test failure| +|[#10970](https://github.com/NVIDIA/spark-rapids/pull/10970)|Move Support for `RaiseError` to a Shim Excluding Spark 4.0.0| +|[#10966](https://github.com/NVIDIA/spark-rapids/pull/10966)|Add default value for REF of premerge jenkinsfile to avoid bad overwritten [skip ci]| +|[#10959](https://github.com/NVIDIA/spark-rapids/pull/10959)|Add new ID to blossom-ci allow list [skip ci]| 
+|[#10952](https://github.com/NVIDIA/spark-rapids/pull/10952)|Add shims to take care of the signature change for writeUDFs in PythonUDFRunner| +|[#10931](https://github.com/NVIDIA/spark-rapids/pull/10931)|Add Support for Renaming of PythonMapInArrow| +|[#10949](https://github.com/NVIDIA/spark-rapids/pull/10949)|Change dependency version to 24.08.0-SNAPSHOT| +|[#10857](https://github.com/NVIDIA/spark-rapids/pull/10857)|[Spark 4.0] Account for `PartitionedFileUtil.splitFiles` signature change.| +|[#10912](https://github.com/NVIDIA/spark-rapids/pull/10912)|GpuInsertIntoHiveTable supports parquet format| +|[#10863](https://github.com/NVIDIA/spark-rapids/pull/10863)|[Spark 4.0] Account for `CommandUtils.uncacheTableOrView` signature change.| +|[#10944](https://github.com/NVIDIA/spark-rapids/pull/10944)|Added Shim for BatchScanExec to Support Spark 4.0| +|[#10946](https://github.com/NVIDIA/spark-rapids/pull/10946)|Unarchive Spark test jar for spark.read(ability)| +|[#10945](https://github.com/NVIDIA/spark-rapids/pull/10945)|Add Support for Multiple Filtering Keys for Subquery Broadcast| +|[#10871](https://github.com/NVIDIA/spark-rapids/pull/10871)|Add classloader diagnostics to initShuffleManager error message| +|[#10933](https://github.com/NVIDIA/spark-rapids/pull/10933)|Fixed Databricks build| +|[#10929](https://github.com/NVIDIA/spark-rapids/pull/10929)|Append new authorized user to blossom-ci whitelist [skip ci]| ## Release 24.06 @@ -48,10 +252,18 @@ Generated on 2024-06-13 ### PRs ||| |:---|:---| +|[#11222](https://github.com/NVIDIA/spark-rapids/pull/11222)|Update change log for v24.06.1 release [skip ci]| +|[#11221](https://github.com/NVIDIA/spark-rapids/pull/11221)|Change cudf version back to 24.06.0-SNAPSHOT [skip ci]| +|[#11217](https://github.com/NVIDIA/spark-rapids/pull/11217)|Update latest changelog [skip ci]| +|[#11211](https://github.com/NVIDIA/spark-rapids/pull/11211)|Use fixed seed for test_from_json_struct_decimal| +|[#11203](https://github.com/NVIDIA/spark-rapids/pull/11203)|Update version to 24.06.1-SNAPSHOT| +|[#11205](https://github.com/NVIDIA/spark-rapids/pull/11205)|Update docs for 24.06.1 release [skip ci]| +|[#11056](https://github.com/NVIDIA/spark-rapids/pull/11056)|Update latest changelog [skip ci]| |[#11052](https://github.com/NVIDIA/spark-rapids/pull/11052)|Add spark343 shim for scala2.13 dist jar| |[#10981](https://github.com/NVIDIA/spark-rapids/pull/10981)|Update latest changelog [skip ci]| |[#10984](https://github.com/NVIDIA/spark-rapids/pull/10984)|[DOC] Update docs for 24.06.0 release [skip ci]| |[#10974](https://github.com/NVIDIA/spark-rapids/pull/10974)|Update rapids JNI and private dependency to 24.06.0| +|[#10830](https://github.com/NVIDIA/spark-rapids/pull/10830)|Use ErrorClass to Throw AnalysisException| |[#10947](https://github.com/NVIDIA/spark-rapids/pull/10947)|Prevent contains-PrefixRange optimization if not preceded by wildcards| |[#10934](https://github.com/NVIDIA/spark-rapids/pull/10934)|Revert "Add Support for Multiple Filtering Keys for Subquery Broadcast "| |[#10870](https://github.com/NVIDIA/spark-rapids/pull/10870)|Add support for self-contained profiling| @@ -69,7 +281,6 @@ Generated on 2024-06-13 |[#10833](https://github.com/NVIDIA/spark-rapids/pull/10833)|Filter out unused json_path tokens| |[#10855](https://github.com/NVIDIA/spark-rapids/pull/10855)|Fix auto merge conflict 10845 [[skip ci]]| |[#10826](https://github.com/NVIDIA/spark-rapids/pull/10826)|Add NVTX ranges to identify Spark stages and tasks| 
-|[#10846](https://github.com/NVIDIA/spark-rapids/pull/10846)|Update latest changelog [skip ci]| |[#10836](https://github.com/NVIDIA/spark-rapids/pull/10836)|Catch exceptions when trying to examine Iceberg scan for metadata queries| |[#10824](https://github.com/NVIDIA/spark-rapids/pull/10824)|Support zstd for GPU shuffle compression| |[#10828](https://github.com/NVIDIA/spark-rapids/pull/10828)|Added DateTimeUtilsShims [Databricks]| @@ -79,7 +290,6 @@ Generated on 2024-06-13 |[#10715](https://github.com/NVIDIA/spark-rapids/pull/10715)|Rewrite some rlike expression to StartsWith/Contains| |[#10820](https://github.com/NVIDIA/spark-rapids/pull/10820)|workaround #10801 temporally| |[#10812](https://github.com/NVIDIA/spark-rapids/pull/10812)|Replace ThreadPoolExecutor creation with ThreadUtils API| -|[#10816](https://github.com/NVIDIA/spark-rapids/pull/10816)|Fix a test error for DB13.3| |[#10813](https://github.com/NVIDIA/spark-rapids/pull/10813)|Fix the errors for Pandas UDF tests on DB13.3| |[#10795](https://github.com/NVIDIA/spark-rapids/pull/10795)|Remove fixed seed for exact `percentile` integration tests| |[#10805](https://github.com/NVIDIA/spark-rapids/pull/10805)|Drop Support for CentOS 7| @@ -117,182 +327,5 @@ Generated on 2024-06-13 |[#10655](https://github.com/NVIDIA/spark-rapids/pull/10655)|Update rapids jni/private dependency to 24.06| |[#10652](https://github.com/NVIDIA/spark-rapids/pull/10652)|Substitute murmurHash32 for spark32BitMurmurHash3| -## Release 24.04 - -### Features -||| -|:---|:---| -|[#10263](https://github.com/NVIDIA/spark-rapids/issues/10263)|[FEA] Add support for reading JSON containing structs where rows are not consistent| -|[#10436](https://github.com/NVIDIA/spark-rapids/issues/10436)|[FEA] Move Spark 3.5.1 out of snapshot once released| -|[#10430](https://github.com/NVIDIA/spark-rapids/issues/10430)|[FEA] Error out when running on an unsupported GPU architecture| -|[#9750](https://github.com/NVIDIA/spark-rapids/issues/9750)|[FEA] Review `JsonToStruct` and `JsonScan` and consolidate some testing and implementation| -|[#8680](https://github.com/NVIDIA/spark-rapids/issues/8680)|[AUDIT][SPARK-42779][SQL] Allow V2 writes to indicate advisory shuffle partition size| -|[#10429](https://github.com/NVIDIA/spark-rapids/issues/10429)|[FEA] Drop support for Databricks 10.4 ML LTS| -|[#10334](https://github.com/NVIDIA/spark-rapids/issues/10334)|[FEA] Turn on memory limits for parquet reader| -|[#10344](https://github.com/NVIDIA/spark-rapids/issues/10344)|[FEA] support barrier mode for mapInPandas/mapInArrow| - -### Performance -||| -|:---|:---| -|[#10578](https://github.com/NVIDIA/spark-rapids/issues/10578)|[FEA] Support project expression rewrite for the case ```stringinstr(str_col, substr) > 0``` to ```contains(str_col, substr)```| -|[#10570](https://github.com/NVIDIA/spark-rapids/issues/10570)|[FEA] See if we can optimize sort for a single batch| -|[#10531](https://github.com/NVIDIA/spark-rapids/issues/10531)|[FEA] Support "WindowGroupLimit" optimization on GPU for Databricks 13.3 ML LTS+| -|[#5553](https://github.com/NVIDIA/spark-rapids/issues/5553)|[FEA][Audit] - Push down StringEndsWith/Contains to Parquet | -|[#8208](https://github.com/NVIDIA/spark-rapids/issues/8208)|[FEA][AUDIT][SPARK-37099][SQL] Introduce the group limit of Window for rank-based filter to optimize top-k computation| -|[#10249](https://github.com/NVIDIA/spark-rapids/issues/10249)|[FEA] Support common subexpression elimination for expand operator| 
-|[#10301](https://github.com/NVIDIA/spark-rapids/issues/10301)|[FEA] Improve performance of from_json| - -### Bugs Fixed -||| -|:---|:---| -|[#10700](https://github.com/NVIDIA/spark-rapids/issues/10700)|[BUG] get_json_object cannot handle ints or boolean values| -|[#10645](https://github.com/NVIDIA/spark-rapids/issues/10645)|[BUG] java.lang.IllegalStateException: Expected to only receive a single batch| -|[#10665](https://github.com/NVIDIA/spark-rapids/issues/10665)|[BUG] Need to update private jar's version to v24.04.1 for spark-rapids v24.04.0 release| -|[#10589](https://github.com/NVIDIA/spark-rapids/issues/10589)|[BUG] ZSTD version mismatch in integration tests| -|[#10255](https://github.com/NVIDIA/spark-rapids/issues/10255)|[BUG] parquet_tests are skipped on Dataproc CI| -|[#10624](https://github.com/NVIDIA/spark-rapids/issues/10624)|[BUG] Deploy script "gpg:sign-and-deploy-file failed: 401 Unauthorized| -|[#10631](https://github.com/NVIDIA/spark-rapids/issues/10631)|[BUG] pending `BlockState` leaks blocks if the shuffle read doesn't finish successfully| -|[#10349](https://github.com/NVIDIA/spark-rapids/issues/10349)|[BUG]Test in json_test.py failed: test_from_json_struct_decimal| -|[#9033](https://github.com/NVIDIA/spark-rapids/issues/9033)|[BUG] GpuGetJsonObject does not expand escaped characters| -|[#10216](https://github.com/NVIDIA/spark-rapids/issues/10216)|[BUG] GetJsonObject fails at spark unit test $.store.book[*].reader| -|[#10217](https://github.com/NVIDIA/spark-rapids/issues/10217)|[BUG] GetJsonObject fails at spark unit test $.store.basket[0][*].b| -|[#10537](https://github.com/NVIDIA/spark-rapids/issues/10537)|[BUG] GetJsonObject throws exception when json path contains a name starting with `'`| -|[#10194](https://github.com/NVIDIA/spark-rapids/issues/10194)|[BUG] GetJsonObject does not validate the input is JSON in the same way as Spark| -|[#10196](https://github.com/NVIDIA/spark-rapids/issues/10196)|[BUG] GetJsonObject does not process escape sequences in returned strings or queries| -|[#10212](https://github.com/NVIDIA/spark-rapids/issues/10212)|[BUG] GetJsonObject should return null for invalid query instead of throwing an exception| -|[#10218](https://github.com/NVIDIA/spark-rapids/issues/10218)|[BUG] GetJsonObject does not normalize non-string output| -|[#10591](https://github.com/NVIDIA/spark-rapids/issues/10591)|[BUG] `test_column_add_after_partition` failed on EGX Standalone cluster| -|[#10277](https://github.com/NVIDIA/spark-rapids/issues/10277)|Add monitoring for GH action deprecations| -|[#10627](https://github.com/NVIDIA/spark-rapids/issues/10627)|[BUG] Integration tests FAILED on: "nvCOMP 2.3/2.4 or newer is required for Zstandard compression"| -|[#10585](https://github.com/NVIDIA/spark-rapids/issues/10585)|[BUG]Test simple pinned blocking alloc Failed nightly tests| -|[#10586](https://github.com/NVIDIA/spark-rapids/issues/10586)|[BUG] YARN EGX IT build failing parquet_testing_test can't find file| -|[#10133](https://github.com/NVIDIA/spark-rapids/issues/10133)|[BUG] test_hash_reduction_collect_set_on_nested_array_type failed in a distributed environment| -|[#10378](https://github.com/NVIDIA/spark-rapids/issues/10378)|[BUG] `test_range_running_window_float_decimal_sum_runs_batched` fails intermittently| -|[#10486](https://github.com/NVIDIA/spark-rapids/issues/10486)|[BUG] StructsToJson does not fall back to the CPU for unsupported timeZone options| -|[#10484](https://github.com/NVIDIA/spark-rapids/issues/10484)|[BUG] JsonToStructs does not fallback when 
columnNameOfCorruptRecord is set| -|[#10460](https://github.com/NVIDIA/spark-rapids/issues/10460)|[BUG] JsonToStructs should reject float numbers for integer types| -|[#10468](https://github.com/NVIDIA/spark-rapids/issues/10468)|[BUG] JsonToStructs and ScanJson should not treat quoted strings as valid integers| -|[#10470](https://github.com/NVIDIA/spark-rapids/issues/10470)|[BUG] ScanJson and JsonToStructs should support parsing quoted decimal strings that are formatted by local (at least for en-US)| -|[#10494](https://github.com/NVIDIA/spark-rapids/issues/10494)|[BUG] JsonToStructs parses INF wrong when nonNumericNumbers is enabled| -|[#10456](https://github.com/NVIDIA/spark-rapids/issues/10456)|[BUG] allowNonNumericNumbers OFF supported for JSON Scan, but not JsonToStructs| -|[#10467](https://github.com/NVIDIA/spark-rapids/issues/10467)|[BUG] JsonToStructs should reject 1. as a valid number| -|[#10469](https://github.com/NVIDIA/spark-rapids/issues/10469)|[BUG] ScanJson should accept "1." as a valid Decimal| -|[#10559](https://github.com/NVIDIA/spark-rapids/issues/10559)|[BUG] test_spark_from_json_date_with_format FAILED on : Part of the plan is not columnar class org.apache.spark.sql.execution.ProjectExec| -|[#10209](https://github.com/NVIDIA/spark-rapids/issues/10209)|[BUG] Test failure hash_aggregate_test.py::test_hash_reduction_collect_set_on_nested_array_type DATAGEN_SEED=1705515231| -|[#10319](https://github.com/NVIDIA/spark-rapids/issues/10319)|[BUG] Shuffled join OOM with 4GB of GPU memory| -|[#10507](https://github.com/NVIDIA/spark-rapids/issues/10507)|[BUG] regexp_test.py FAILED test_regexp_extract_all_idx_positive[DATAGEN_SEED=1709054829, INJECT_OOM]| -|[#10527](https://github.com/NVIDIA/spark-rapids/issues/10527)|[BUG] Build on Databricks failed with GpuGetJsonObject.scala:19: object parsing is not a member of package util| -|[#10509](https://github.com/NVIDIA/spark-rapids/issues/10509)|[BUG] scalar leaks when running nds query51| -|[#10214](https://github.com/NVIDIA/spark-rapids/issues/10214)|[BUG] GetJsonObject does not support unquoted array like notation| -|[#10215](https://github.com/NVIDIA/spark-rapids/issues/10215)|[BUG] GetJsonObject removes leading space characters| -|[#10213](https://github.com/NVIDIA/spark-rapids/issues/10213)|[BUG] GetJsonObject supports array index notation without a root| -|[#10452](https://github.com/NVIDIA/spark-rapids/issues/10452)|[BUG] JsonScan and from_json share fallback checks, but have hard coded names in the results| -|[#10455](https://github.com/NVIDIA/spark-rapids/issues/10455)|[BUG] JsonToStructs and ScanJson do not fall back/support it properly if single quotes are disabled| -|[#10219](https://github.com/NVIDIA/spark-rapids/issues/10219)|[BUG] GetJsonObject sees a double quote in a single quoted string as invalid| -|[#10431](https://github.com/NVIDIA/spark-rapids/issues/10431)|[BUG] test_casting_from_overflow_double_to_timestamp `DID NOT RAISE `| -|[#10499](https://github.com/NVIDIA/spark-rapids/issues/10499)|[BUG] Unit tests core dump as below| -|[#9325](https://github.com/NVIDIA/spark-rapids/issues/9325)|[BUG] test_csv_infer_schema_timestamp_ntz fails| -|[#10422](https://github.com/NVIDIA/spark-rapids/issues/10422)|[BUG] test_get_json_object_single_quotes failure| -|[#10411](https://github.com/NVIDIA/spark-rapids/issues/10411)|[BUG] Some fast parquet tests fail if the time zone is not UTC| -|[#10410](https://github.com/NVIDIA/spark-rapids/issues/10410)|[BUG]delta_lake_update_test.py::test_delta_update_partitions[['a', 'b']-False] 
failed by DATAGEN_SEED=1707683137| -|[#10404](https://github.com/NVIDIA/spark-rapids/issues/10404)|[BUG] GpuJsonTuple memory leak| -|[#10382](https://github.com/NVIDIA/spark-rapids/issues/10382)|[BUG] Complile failed on branch-24.04 : literals.scala:32: object codec is not a member of package org.apache.commons| - -### PRs -||| -|:---|:---| -|[#10844](https://github.com/NVIDIA/spark-rapids/pull/10844)|Update rapids private dependency to 24.04.3| -|[#10788](https://github.com/NVIDIA/spark-rapids/pull/10788)|[DOC] Update archive page for v24.04.1 [skip ci]| -|[#10784](https://github.com/NVIDIA/spark-rapids/pull/10784)|Update latest changelog [skip ci]| -|[#10782](https://github.com/NVIDIA/spark-rapids/pull/10782)|Update latest changelog [skip ci]| -|[#10780](https://github.com/NVIDIA/spark-rapids/pull/10780)|[DOC]Update download page for v24.04.1 [skip ci]| -|[#10778](https://github.com/NVIDIA/spark-rapids/pull/10778)|Update version to 24.04.1-SNAPSHOT| -|[#10777](https://github.com/NVIDIA/spark-rapids/pull/10777)|Update rapids JNI dependency: private to 24.04.2| -|[#10683](https://github.com/NVIDIA/spark-rapids/pull/10683)|Update latest changelog [skip ci]| -|[#10681](https://github.com/NVIDIA/spark-rapids/pull/10681)|Update rapids JNI dependency to 24.04.0, private to 24.04.1| -|[#10660](https://github.com/NVIDIA/spark-rapids/pull/10660)|Ensure an executor broadcast is in a single batch| -|[#10676](https://github.com/NVIDIA/spark-rapids/pull/10676)|[DOC] Update docs for 24.04.0 release [skip ci]| -|[#10654](https://github.com/NVIDIA/spark-rapids/pull/10654)|Add a config to switch back to old impl for getJsonObject| -|[#10667](https://github.com/NVIDIA/spark-rapids/pull/10667)|Update rapids private dependency to 24.04.1| -|[#10664](https://github.com/NVIDIA/spark-rapids/pull/10664)|Remove build link from the premerge-CI workflow| -|[#10657](https://github.com/NVIDIA/spark-rapids/pull/10657)|Revert "Host Memory OOM handling for RowToColumnarIterator (#10617)"| -|[#10625](https://github.com/NVIDIA/spark-rapids/pull/10625)|Pin to 3.1.0 maven-gpg-plugin in deploy script [skip ci]| -|[#10637](https://github.com/NVIDIA/spark-rapids/pull/10637)|Cleanup async state when multi-threaded shuffle readers fail| -|[#10617](https://github.com/NVIDIA/spark-rapids/pull/10617)|Host Memory OOM handling for RowToColumnarIterator| -|[#10614](https://github.com/NVIDIA/spark-rapids/pull/10614)|Use random seed for `test_from_json_struct_decimal`| -|[#10581](https://github.com/NVIDIA/spark-rapids/pull/10581)|Use new jni kernel for getJsonObject| -|[#10630](https://github.com/NVIDIA/spark-rapids/pull/10630)|Fix removal of internal metadata information in 350 shim| -|[#10623](https://github.com/NVIDIA/spark-rapids/pull/10623)|Auto merge PRs to branch-24.06 from branch-24.04 [skip ci]| -|[#10616](https://github.com/NVIDIA/spark-rapids/pull/10616)|Pass metadata extractors to FileScanRDD| -|[#10620](https://github.com/NVIDIA/spark-rapids/pull/10620)|Remove unused shared lib in Jenkins files| -|[#10615](https://github.com/NVIDIA/spark-rapids/pull/10615)|Turn off state logging in HostAllocSuite| -|[#10610](https://github.com/NVIDIA/spark-rapids/pull/10610)|Do not replace TableCacheQueryStageExec| -|[#10599](https://github.com/NVIDIA/spark-rapids/pull/10599)|Call globStatus directly via PY4J in hdfs_glob to avoid calling hadoop command| -|[#10602](https://github.com/NVIDIA/spark-rapids/pull/10602)|Remove InMemoryTableScanExec support for Spark 3.5+| -|[#10608](https://github.com/NVIDIA/spark-rapids/pull/10608)|Update 
perfio.s3.enabled doc to fix build failure [skip ci]| -|[#10598](https://github.com/NVIDIA/spark-rapids/pull/10598)|Update CI script to build and deploy using the same CUDA classifier[skip ci]| -|[#10575](https://github.com/NVIDIA/spark-rapids/pull/10575)|Update JsonToStructs and ScanJson to have white space normalization| -|[#10597](https://github.com/NVIDIA/spark-rapids/pull/10597)|add guardword to hide cloud info| -|[#10540](https://github.com/NVIDIA/spark-rapids/pull/10540)|Handle minimum GPU architecture supported| -|[#10584](https://github.com/NVIDIA/spark-rapids/pull/10584)|Add in small optimization for instr comparison| -|[#10590](https://github.com/NVIDIA/spark-rapids/pull/10590)|Turn on transition logging in HostAllocSuite| -|[#10572](https://github.com/NVIDIA/spark-rapids/pull/10572)|Improve performance of Sort for the common single batch use case| -|[#10568](https://github.com/NVIDIA/spark-rapids/pull/10568)|Add configuration to share JNI pinned pool with cuIO| -|[#10550](https://github.com/NVIDIA/spark-rapids/pull/10550)|Enable window-group-limit optimization on| -|[#10542](https://github.com/NVIDIA/spark-rapids/pull/10542)|Make JSON parsing common between JsonToStructs and ScanJson| -|[#10562](https://github.com/NVIDIA/spark-rapids/pull/10562)|Fix test_spark_from_json_date_with_format when run in a non-UTC TZ| -|[#10564](https://github.com/NVIDIA/spark-rapids/pull/10564)|Enable specifying specific integration test methods via TESTS environment| -|[#10563](https://github.com/NVIDIA/spark-rapids/pull/10563)|Append new authorized user to blossom-ci safelist [skip ci]| -|[#10520](https://github.com/NVIDIA/spark-rapids/pull/10520)|Distinct left join| -|[#10538](https://github.com/NVIDIA/spark-rapids/pull/10538)|Move K8s cloud name into common lib for Jenkins CI| -|[#10552](https://github.com/NVIDIA/spark-rapids/pull/10552)|Fix issues when no value can be extracted from a regular expression| -|[#10522](https://github.com/NVIDIA/spark-rapids/pull/10522)|Fix missing scala-parser-combinators dependency on Databricks| -|[#10549](https://github.com/NVIDIA/spark-rapids/pull/10549)|Update to latest branch-24.02 [skip ci]| -|[#10544](https://github.com/NVIDIA/spark-rapids/pull/10544)|Fix merge conflict from branch-24.02| -|[#10503](https://github.com/NVIDIA/spark-rapids/pull/10503)|Distinct inner join| -|[#10512](https://github.com/NVIDIA/spark-rapids/pull/10512)|Move to parsing from_json input preserving quoted strings.| -|[#10528](https://github.com/NVIDIA/spark-rapids/pull/10528)|Fix auto merge conflict 10523| -|[#10519](https://github.com/NVIDIA/spark-rapids/pull/10519)|Replicate HostColumnVector.ColumnBuilder in plugin to enable host memory oom work| -|[#10521](https://github.com/NVIDIA/spark-rapids/pull/10521)|Fix Spark 3.5.1 build| -|[#10516](https://github.com/NVIDIA/spark-rapids/pull/10516)|One more metric for expand| -|[#10500](https://github.com/NVIDIA/spark-rapids/pull/10500)|Support "WindowGroupLimit" optimization on GPU| -|[#10508](https://github.com/NVIDIA/spark-rapids/pull/10508)|Move 351 shims into noSnapshot buildvers| -|[#10510](https://github.com/NVIDIA/spark-rapids/pull/10510)|Fix scalar leak in SumBinaryFixer| -|[#10466](https://github.com/NVIDIA/spark-rapids/pull/10466)|Use parser from spark to normalize json path in GetJsonObject| -|[#10490](https://github.com/NVIDIA/spark-rapids/pull/10490)|Start working on a more complete json test matrix json| -|[#10497](https://github.com/NVIDIA/spark-rapids/pull/10497)|Add minValue overflow check in ORC double-to-timestamp 
cast| -|[#10501](https://github.com/NVIDIA/spark-rapids/pull/10501)|Fix scalar leak in WindowRetrySuite| -|[#10474](https://github.com/NVIDIA/spark-rapids/pull/10474)|Remove Support for Databricks 10.4| -|[#10418](https://github.com/NVIDIA/spark-rapids/pull/10418)|Enable GpuShuffledSymmetricHashJoin by default| -|[#10450](https://github.com/NVIDIA/spark-rapids/pull/10450)|Improve internal row to columnar host memory by using a combined spillable buffer| -|[#10440](https://github.com/NVIDIA/spark-rapids/pull/10440)|Generate CSV data per Spark version for tools| -|[#10449](https://github.com/NVIDIA/spark-rapids/pull/10449)|[DOC] Fix table rendering issue in github.io download UI page [skip ci]| -|[#10438](https://github.com/NVIDIA/spark-rapids/pull/10438)|Integrate perfio.s3 reader| -|[#10423](https://github.com/NVIDIA/spark-rapids/pull/10423)|Disable Integration Test:`test_get_json_object_single_quotes` on DB 10.4| -|[#10419](https://github.com/NVIDIA/spark-rapids/pull/10419)|Export TZ in tests when default TZ is used| -|[#10426](https://github.com/NVIDIA/spark-rapids/pull/10426)|Fix auto merge conflict 10425 [skip ci]| -|[#10427](https://github.com/NVIDIA/spark-rapids/pull/10427)|Update test doc for 24.04 [skip ci]| -|[#10396](https://github.com/NVIDIA/spark-rapids/pull/10396)|Remove inactive user from github workflow [skip ci]| -|[#10421](https://github.com/NVIDIA/spark-rapids/pull/10421)|Use withRetry when manifesting spillable batch in GpuShuffledHashJoinExec| -|[#10420](https://github.com/NVIDIA/spark-rapids/pull/10420)|Disable JsonTuple by default| -|[#10407](https://github.com/NVIDIA/spark-rapids/pull/10407)|Enable Single Quote Support in getJSONObject API with GetJsonObjectOptions| -|[#10415](https://github.com/NVIDIA/spark-rapids/pull/10415)|Avoid comparing Delta logs when writing partitioned tables| -|[#10247](https://github.com/NVIDIA/spark-rapids/pull/10247)|Improve `GpuExpand` by pre-projecting some columns| -|[#10248](https://github.com/NVIDIA/spark-rapids/pull/10248)|Group-by aggregation based optimization for UNBOUNDED `collect_set` window function| -|[#10406](https://github.com/NVIDIA/spark-rapids/pull/10406)|Enabled subPage chunking by default| -|[#10361](https://github.com/NVIDIA/spark-rapids/pull/10361)|Add in basic support for JSON generation in BigDataGen and improve performance of from_json| -|[#10158](https://github.com/NVIDIA/spark-rapids/pull/10158)|Add in framework for unbounded to unbounded window agg optimization| -|[#10394](https://github.com/NVIDIA/spark-rapids/pull/10394)|Fix auto merge conflict 10393 [skip ci]| -|[#10375](https://github.com/NVIDIA/spark-rapids/pull/10375)|Support barrier mode for mapInPandas/mapInArrow| -|[#10356](https://github.com/NVIDIA/spark-rapids/pull/10356)|Update locate_parquet_testing_files function to support hdfs input path for dataproc CI| -|[#10369](https://github.com/NVIDIA/spark-rapids/pull/10369)|Revert "Support barrier mode for mapInPandas/mapInArrow (#10364)"| -|[#10358](https://github.com/NVIDIA/spark-rapids/pull/10358)|Disable Spark UI by default for integration tests| -|[#10360](https://github.com/NVIDIA/spark-rapids/pull/10360)|Fix a memory leak in json tuple| -|[#10364](https://github.com/NVIDIA/spark-rapids/pull/10364)|Support barrier mode for mapInPandas/mapInArrow| -|[#10348](https://github.com/NVIDIA/spark-rapids/pull/10348)|Remove redundant joinOutputRows metric| -|[#10321](https://github.com/NVIDIA/spark-rapids/pull/10321)|Bump up dependency version to 24.04.0-SNAPSHOT| 
-|[#10330](https://github.com/NVIDIA/spark-rapids/pull/10330)|Add tryAcquire to GpuSemaphore| -|[#10258](https://github.com/NVIDIA/spark-rapids/pull/10258)|Init project version 24.04.0-SNAPSHOT| - ## Older Releases Changelog of older releases can be found at [docs/archives](/docs/archives) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 295006be49c..430af075782 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -50,11 +50,11 @@ mvn verify After a successful build, the RAPIDS Accelerator jar will be in the `dist/target/` directory. This will build the plugin for a single version of Spark. By default, this is Apache Spark -3.1.1. To build against other versions of Spark you use the `-Dbuildver=XXX` command line option -to Maven. For instance to build Spark 3.1.1 you would use: +3.2.0. To build against other versions of Spark you use the `-Dbuildver=XXX` command line option +to Maven. For instance to build Spark 3.2.0 you would use: ```shell script -mvn -Dbuildver=311 verify +mvn -Dbuildver=320 verify ``` You can find all available build versions in the top level pom.xml file. If you are building for Databricks then you should use the `jenkins/databricks/build.sh` script and modify it for @@ -92,7 +92,7 @@ CodeCache: size=245760Kb used=236139Kb max_used=243799Kb free=9620Kb compilation: disabled (not enough contiguous free space left) ``` -It can be mitigated by increasing [ReservedCodeCacheSize](https://spark.apache.org/docs/3.3.1/building-spark.html#setting-up-mavens-memory-usage) +It can be mitigated by increasing [ReservedCodeCacheSize](https://spark.apache.org/docs/latest/building-spark.html#setting-up-mavens-memory-usage) passed in the `MAVEN_OPTS` environment variable. ### Building a Distribution for Multiple Versions of Spark @@ -110,7 +110,7 @@ If you want to create a jar with multiple versions we have the following options 3. Build for all Apache Spark versions, CDH and Databricks with no SNAPSHOT versions of Spark, only released. Use `-PnoSnaphsotsWithDatabricks`. 4. Build for all Apache Spark versions, CDH and Databricks including SNAPSHOT versions of Spark we have supported for. Use `-PsnapshotsWithDatabricks` 5. Build for an arbitrary combination of comma-separated build versions using `-Dincluded_buildvers=`. - E.g., `-Dincluded_buildvers=312,330` + E.g., `-Dincluded_buildvers=320,330` You must first build each of the versions of Spark and then build one final time using the profile for the option you want. @@ -118,9 +118,6 @@ You can also install some manually and build a combined jar. For instance to bui ```shell script mvn clean -mvn -Dbuildver=311 install -Drat.skip=true -DskipTests -mvn -Dbuildver=312 install -Drat.skip=true -DskipTests -mvn -Dbuildver=313 install -Drat.skip=true -DskipTests mvn -Dbuildver=320 install -Drat.skip=true -DskipTests mvn -Dbuildver=321 install -Drat.skip=true -DskipTests mvn -Dbuildver=321cdh install -Drat.skip=true -DskipTests @@ -130,15 +127,15 @@ mvn -pl dist -PnoSnapshots package -DskipTests Verify that shim-specific classes are hidden from a conventional classloader. 
```bash -$ javap -cp dist/target/rapids-4-spark_2.12-24.08.0-SNAPSHOT-cuda11.jar com.nvidia.spark.rapids.shims.SparkShimImpl +$ javap -cp dist/target/rapids-4-spark_2.12-24.10.0-SNAPSHOT-cuda11.jar com.nvidia.spark.rapids.shims.SparkShimImpl Error: class not found: com.nvidia.spark.rapids.shims.SparkShimImpl ``` However, its bytecode can be loaded if prefixed with `spark3XY` not contained in the package name ```bash -$ javap -cp dist/target/rapids-4-spark_2.12-24.08.0-SNAPSHOT-cuda11.jar spark320.com.nvidia.spark.rapids.shims.SparkShimImpl | head -2 -Warning: File dist/target/rapids-4-spark_2.12-24.08.0-SNAPSHOT-cuda11.jar(/spark320/com/nvidia/spark/rapids/shims/SparkShimImpl.class) does not contain class spark320.com.nvidia.spark.rapids.shims.SparkShimImpl +$ javap -cp dist/target/rapids-4-spark_2.12-24.10.0-SNAPSHOT-cuda11.jar spark320.com.nvidia.spark.rapids.shims.SparkShimImpl | head -2 +Warning: File dist/target/rapids-4-spark_2.12-24.10.0-SNAPSHOT-cuda11.jar(/spark320/com/nvidia/spark/rapids/shims/SparkShimImpl.class) does not contain class spark320.com.nvidia.spark.rapids.shims.SparkShimImpl Compiled from "SparkShims.scala" public final class com.nvidia.spark.rapids.shims.SparkShimImpl { ``` @@ -150,9 +147,9 @@ There is a build script `build/buildall` that automates the local build process. By default, it builds everything that is needed to create a distribution jar for all released (noSnapshots) Spark versions except for Databricks. Other profiles that you can pass using `--profile=` include - `snapshots` that includes all released (noSnapshots) and snapshots Spark versions except for Databricks -- `minimumFeatureVersionMix` that currently includes 321cdh, 312, 320, 330 is recommended for catching incompatibilities already in the local development cycle +- `minimumFeatureVersionMix` that currently includes 321cdh, 320, 330 is recommended for catching incompatibilities already in the local development cycle -For initial quick iterations we can use `--profile=` to build a single-shim version. e.g., `--profile=311` for Spark 3.1.1. +For initial quick iterations we can use `--profile=` to build a single-shim version. e.g., `--profile=320` for Spark 3.2.0. The option `--module=` allows to limit the number of build steps. When iterating, we often don't have the need for the entire build. We may be interested in building everything necessary just to run integration tests (`--module=integration_tests`), or we may want to just rebuild the distribution jar (`--module=dist`) @@ -181,7 +178,7 @@ mvn package -pl dist -am -Dbuildver=340 -DallowConventionalDistJar=true Verify `com.nvidia.spark.rapids.shims.SparkShimImpl` is conventionally loadable: ```bash -$ javap -cp dist/target/rapids-4-spark_2.12-24.08.0-SNAPSHOT-cuda11.jar com.nvidia.spark.rapids.shims.SparkShimImpl | head -2 +$ javap -cp dist/target/rapids-4-spark_2.12-24.10.0-SNAPSHOT-cuda11.jar com.nvidia.spark.rapids.shims.SparkShimImpl | head -2 Compiled from "SparkShims.scala" public final class com.nvidia.spark.rapids.shims.SparkShimImpl { ``` @@ -201,7 +198,7 @@ NOTE: Build process does not require an ARM machine, so if you want to build the on X86 machine, please also add `-DskipTests` in commands. 
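For example, a cross-build on an x86 host might look like the following sketch (it simply combines the note above with the aarch64 command shown next; `-Dbuildver=320` is the single-shim example used elsewhere in this guide):

```bash
mvn clean verify -Dbuildver=320 -Parm64 -DskipTests
```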
```bash -mvn clean verify -Dbuildver=311 -Parm64 +mvn clean verify -Dbuildver=320 -Parm64 ``` ### Iterative development during local testing @@ -377,7 +374,7 @@ the symlink `.bloop` to point to the corresponding directory `.bloop-spark3XY` Example usage: ```Bash -./build/buildall --generate-bloop --profile=311,330 +./build/buildall --generate-bloop --profile=320,330 rm -vf .bloop ln -s .bloop-spark330 .bloop ``` @@ -414,7 +411,7 @@ Install [Scala Metals extension](https://scalameta.org/metals/docs/editors/vscod either locally or into a Remote-SSH extension destination depending on your target environment. When your project folder is open in VS Code, it may prompt you to import Maven project. IMPORTANT: always decline with "Don't ask again", otherwise it will overwrite the Bloop projects -generated with the default `311` profile. If you need to use a different profile, always rerun the +generated with the default `320` profile. If you need to use a different profile, always rerun the command above manually. When regenerating projects it's recommended to proceed to Metals "Build commands" View, and click: 1. "Restart build server" diff --git a/README.md b/README.md index 297e6eacb2f..789219dde27 100644 --- a/README.md +++ b/README.md @@ -73,7 +73,7 @@ as a `provided` dependency. com.nvidia rapids-4-spark_2.12 - 24.08.0-SNAPSHOT + 24.10.0-SNAPSHOT provided ``` diff --git a/aggregator/pom.xml b/aggregator/pom.xml index 8cf881419c9..9ba0a1088e5 100644 --- a/aggregator/pom.xml +++ b/aggregator/pom.xml @@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 24.08.0-SNAPSHOT + 24.10.0-SNAPSHOT ../jdk-profiles/pom.xml rapids-4-spark-aggregator_2.12 RAPIDS Accelerator for Apache Spark Aggregator Creates an aggregated shaded package of the RAPIDS plugin for Apache Spark - 24.08.0-SNAPSHOT + 24.10.0-SNAPSHOT aggregator @@ -37,12 +37,14 @@ and they are auto-deduped using binary diff --> com.nvidia.shaded.spark + com/nvidia/shaded/spark false none **/* initialize initialize + none @@ -86,10 +88,21 @@ + + + true + + + + + META-INF/versions/(\d+)/org/roaringbitmap + META-INF/versions/$1/${rapids.shade.package.path}/org/roaringbitmap + true + com.google.flatbuffers ${rapids.shade.package}.com.google.flatbuffers @@ -252,79 +265,11 @@ - release311 + release320 true - - buildver - 311 - - - - - com.nvidia - rapids-4-spark-delta-stub_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - - - release312 - - - buildver - 312 - - - - - com.nvidia - rapids-4-spark-delta-stub_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - - - release313 - - - buildver - 313 - - - - - com.nvidia - rapids-4-spark-delta-stub_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - - - release314 - - - buildver - 314 - - - - - com.nvidia - rapids-4-spark-delta-stub_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - - - release320 - buildver 320 @@ -783,5 +728,41 @@ + + release352 + + + buildver + 352 + + + + + com.nvidia + rapids-4-spark-delta-stub_${scala.binary.version} + ${project.version} + ${spark.version.classifier} + + + + diff --git a/api_validation/README.md b/api_validation/README.md index 5f34354656d..482b3e76a58 100644 --- a/api_validation/README.md +++ b/api_validation/README.md @@ -21,7 +21,7 @@ cd api_validation sh auditAllVersions.sh // To run script on particular version we can use profile -mvn scala:run -P spark311 +mvn scala:run -P spark320 ``` # Output diff --git 
a/api_validation/auditAllVersions.sh b/api_validation/auditAllVersions.sh index 5deddacec65..27aeedcd4ba 100644 --- a/api_validation/auditAllVersions.sh +++ b/api_validation/auditAllVersions.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,4 +14,4 @@ # limitations under the License. set -ex -mvn scala:run -P spark311 +mvn scala:run -P spark320 diff --git a/api_validation/pom.xml b/api_validation/pom.xml index 7b892754d28..4eec0e2ab02 100644 --- a/api_validation/pom.xml +++ b/api_validation/pom.xml @@ -22,11 +22,11 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.12 - 24.08.0-SNAPSHOT + 24.10.0-SNAPSHOT ../shim-deps/pom.xml rapids-4-spark-api-validation_2.12 - 24.08.0-SNAPSHOT + 24.10.0-SNAPSHOT api_validation diff --git a/api_validation/src/main/scala/com/nvidia/spark/rapids/api/ApiValidation.scala b/api_validation/src/main/scala/com/nvidia/spark/rapids/api/ApiValidation.scala index 58d273d2148..5821c6e18ac 100644 --- a/api_validation/src/main/scala/com/nvidia/spark/rapids/api/ApiValidation.scala +++ b/api_validation/src/main/scala/com/nvidia/spark/rapids/api/ApiValidation.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -69,7 +69,7 @@ object ApiValidation extends Logging { val gpuKeys = gpuExecs.keys var printNewline = false - val sparkToShimMap = Map("3.1.1" -> "spark311") + val sparkToShimMap = Map("3.2.0" -> "spark320") val sparkVersion = ShimLoader.getShimVersion.toString val shimVersion = sparkToShimMap(sparkVersion) diff --git a/build/buildall b/build/buildall index b3c473be141..f9ac0507922 100755 --- a/build/buildall +++ b/build/buildall @@ -91,6 +91,12 @@ function versionsFromDistProfile() { echo -n $versionStr } +function versionsFromReleaseProfiles() { + versionRawStr=$(python build/get_buildvers.py $1 $2) + versionStr=${versionRawStr//[$'\n',]/} + echo -n $versionStr +} + FINAL_OP="package" while [[ "$1" != "" ]] ; do @@ -161,7 +167,6 @@ if [[ "$DIST_PROFILE" == *Scala213 ]]; then SCALA213=1 fi - # include options to mvn command export MVN="mvn -Dmaven.wagon.http.retryHandler.count=3 ${MVN_OPT}" @@ -174,29 +179,30 @@ fi [[ "$MODULE" != "" ]] && MODULE_OPT="--projects $MODULE --also-make" || MODULE_OPT="" +echo "Collecting Spark versions..." 
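# Example (illustrative): with the versionsFromReleaseProfiles helper above, resolving
# the Scala 2.12 no-snapshot shim list is roughly equivalent to running
#   python build/get_buildvers.py no_snapshots pom.xml
# which prints the comma-separated buildvers parsed from the pom's release profiles.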
case $DIST_PROFILE in snapshotsScala213) - SPARK_SHIM_VERSIONS=($(versionsFromDistProfile "snapshotsScala213")) + SPARK_SHIM_VERSIONS=($(versionsFromReleaseProfiles "snap_and_no_snap" "scala2.13/pom.xml")) ;; noSnapshotsScala213) - SPARK_SHIM_VERSIONS=($(versionsFromDistProfile "noSnapshotsScala213")) + SPARK_SHIM_VERSIONS=($(versionsFromReleaseProfiles "no_snapshots" "scala2.13/pom.xml")) ;; snapshots?(WithDatabricks)) - SPARK_SHIM_VERSIONS=($(versionsFromDistProfile "snapshots")) + SPARK_SHIM_VERSIONS=($(versionsFromReleaseProfiles "snap_and_no_snap" "pom.xml")) ;; noSnapshots?(WithDatabricks)) - SPARK_SHIM_VERSIONS=($(versionsFromDistProfile "noSnapshots")) + SPARK_SHIM_VERSIONS=($(versionsFromReleaseProfiles "no_snapshots" "pom.xml")) ;; minimumFeatureVersionMix) SPARK_SHIM_VERSIONS=($(versionsFromDistProfile "minimumFeatureVersionMix")) ;; - 3*) + [34]*) <<< $DIST_PROFILE IFS="," read -ra SPARK_SHIM_VERSIONS INCLUDED_BUILDVERS_OPT="-Dincluded_buildvers=$DIST_PROFILE" unset DIST_PROFILE @@ -216,8 +222,6 @@ if [[ "$GEN_BLOOP" == "true" ]]; then exit 0 fi -[[ "$DIST_PROFILE" != "" ]] && MVN_PROFILE_OPT="-P$DIST_PROFILE" || MVN_PROFILE_OPT="" - # First element in SPARK_SHIM_VERSIONS to do most of the checks export BASE_VER=${SPARK_SHIM_VERSIONS[0]} export NUM_SHIMS=${#SPARK_SHIM_VERSIONS[@]} @@ -274,8 +278,8 @@ export -f build_single_shim # Install all the versions for DIST_PROFILE # First build the aggregator module for all SPARK_SHIM_VERSIONS in parallel skipping expensive plugins that -# - either deferred to 311 because the check is identical in all shim profiles such as scalastyle -# - or deferred to 311 because we currently don't require it per shim such as scaladoc generation +# - either deferred to 320 because the check is identical in all shim profiles such as scalastyle +# - or deferred to 320 because we currently don't require it per shim such as scaladoc generation # - or there is a dedicated step to run against a particular shim jar such as unit tests, in # the near future we will run unit tests against a combined multi-shim jar to catch classloading # regressions even before pytest-based integration_tests @@ -296,12 +300,12 @@ time ( fi # This used to resume from dist. However, without including aggregator in the build # the build does not properly initialize spark.version property via buildver profiles - # in the root pom, and we get a missing spark311 dependency even for --profile=312,321 + # in the root pom, and we get a missing spark320 dependency even for --profile=320,321 # where the build does not require it. Moving it to aggregator resolves this issue with # a negligible increase of the build time by ~2 seconds. joinShimBuildFrom="aggregator" echo "Resuming from $joinShimBuildFrom build only using $BASE_VER" - $MVN $FINAL_OP -rf $joinShimBuildFrom $MODULE_OPT $MVN_PROFILE_OPT $INCLUDED_BUILDVERS_OPT \ + $MVN $FINAL_OP -rf $joinShimBuildFrom $MODULE_OPT $INCLUDED_BUILDVERS_OPT \ -Dbuildver="$BASE_VER" \ -DskipTests -Dmaven.scaladoc.skip ) diff --git a/build/coverage-report b/build/coverage-report index 69c96c050dc..ddde86b6aea 100755 --- a/build/coverage-report +++ b/build/coverage-report @@ -1,7 +1,7 @@ #!/bin/bash # -# Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -23,7 +23,7 @@ TMP_CLASS=${TEMP_CLASS_LOC:-"./target/jacoco_classes/"} HTML_LOC=${HTML_LOCATION:="./target/jacoco-report/"} XML_LOC=${XML_LOCATION:="${HTML_LOC}"} DIST_JAR=${RAPIDS_DIST_JAR:-$(ls ./dist/target/rapids-4-spark_2.12-*cuda*.jar | grep -v test | head -1 | xargs readlink -f)} -SPK_VER=${JACOCO_SPARK_VER:-"311"} +SPK_VER=${JACOCO_SPARK_VER:-"320"} UDF_JAR=${RAPIDS_UDF_JAR:-$(ls ./udf-compiler/target/spark${SPK_VER}/rapids-4-spark-udf_2.12-*-SNAPSHOT-spark${SPK_VER}.jar | grep -v test | head -1 | xargs readlink -f)} SOURCE_DIRS=${SOURCE_DIRS:-"./sql-plugin/src/main/scala/:./sql-plugin/src/main/java/:./shuffle-plugin/src/main/scala/:./udf-compiler/src/main/scala/"} @@ -33,11 +33,11 @@ SOURCE_WITH_ARGS="--sourcefiles "$(echo $SOURCE_DIRS | sed -e 's/:/ --sourcefile rm -rf "$TMP_CLASS" mkdir -p "$TMP_CLASS" pushd "$TMP_CLASS" -jar xf "$DIST_JAR" com org rapids spark3xx-common "spark${SPK_VER}/" +jar xf "$DIST_JAR" com org rapids spark-shared "spark${SPK_VER}/" # extract the .class files in udf jar and replace the existing ones in spark3xx-ommon and spark$SPK_VER # because the class files in udf jar will be modified in aggregator's shade phase jar xf "$UDF_JAR" com/nvidia/spark/udf -rm -rf com/nvidia/shaded/ org/openucx/ spark3xx-common/com/nvidia/spark/udf/ spark${SPK_VER}/com/nvidia/spark/udf/ +rm -rf com/nvidia/shaded/ org/openucx/ spark-shared/com/nvidia/spark/udf/ spark${SPK_VER}/com/nvidia/spark/udf/ popd if [ ! -f "$JDEST" ]; then diff --git a/build/dyn_shim_detection.py b/build/dyn_shim_detection.py new file mode 100644 index 00000000000..3282ad645df --- /dev/null +++ b/build/dyn_shim_detection.py @@ -0,0 +1,35 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import sys + +_log = logging.getLogger("dyn-shim-detection") +# This script is called by maven's antrun plugin. The `project` variable is set by antrun which contains all the +# properties that are set in the pom.xml. 
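# For instance, project.getProperty("dyn.shim.buildver") below returns the value of the
# dyn.shim.buildver property (e.g. "snap_and_no_snap"), which selects the buildver list to resolve.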
For more details checkout the documentation of the `script` task +# https://ant.apache.org/manual/Tasks/script.html +show_version_info = project.getProperty("dyn.shim.trace") +_log.setLevel(logging.DEBUG if show_version_info else logging.INFO) +# Same as shimplify.py +ch = logging.StreamHandler() +ch.setFormatter(logging.Formatter('%(name)s - %(levelname)s - %(message)s')) +_log.addHandler(ch) +spark_rapids_source_basedir = project.getProperty("spark.rapids.source.basedir") +multi_module_project_dir = project.getProperty("spark.rapids.project.basedir") +buildvers = project.getProperty("dyn.shim.buildver") + +sys.path.append("{}/build/".format(spark_rapids_source_basedir)) +from get_buildvers import _get_buildvers +value = _get_buildvers(buildvers, "{}/pom.xml".format(multi_module_project_dir), _log) +project.setProperty("included_buildvers", value) \ No newline at end of file diff --git a/build/get_buildvers.py b/build/get_buildvers.py new file mode 100644 index 00000000000..bfce9656054 --- /dev/null +++ b/build/get_buildvers.py @@ -0,0 +1,73 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import xml.etree.ElementTree as ET + + +def _get_buildvers(buildvers, pom_file, logger=None): + pom = ET.parse(pom_file) + ns = {"pom": "http://maven.apache.org/POM/4.0.0"} + releases = [] + release_prefix = "release" + for profile in pom.findall(".//pom:profile/pom:id", ns): + if profile.text.startswith(release_prefix): + releases.append(profile.text[len(release_prefix):]) + snapshots = [] + no_snapshots = [] + + for release in releases: + spark_version = pom.find(".//pom:spark{}.version".format(release), ns) + if spark_version.text.endswith("SNAPSHOT"): + snapshots.append(release) + else: + no_snapshots.append(release) + excluded_shims = pom.find(".//pom:dyn.shim.excluded.releases", ns) + if excluded_shims: + for removed_shim in [x.strip() for x in excluded_shims.text.split(",")]: + if removed_shim in snapshots: + snapshots.remove(removed_shim) + elif removed_shim in no_snapshots: + no_snapshots.remove(removed_shim) + else: + raise Exception( + "Shim {} listed in dyn.shim.excluded.releases in pom.xml not present in releases".format( + removed_shim)) + + if "scala2.13" in pom_file: + no_snapshots = list(filter(lambda x: not x.endswith("cdh"), no_snapshots)) + + db_release = list(filter(lambda x: x.endswith("db"), no_snapshots)) + no_snapshots = list(filter(lambda x: not x.endswith("db"), no_snapshots)) + snap_and_no_snap = no_snapshots + snapshots + snap_and_no_snap_with_db = snap_and_no_snap + db_release + no_snap_with_db = no_snapshots + db_release + all_buildvers = snap_and_no_snap + db_release + release_dict = {"databricks": ", ".join(db_release), "snapshots": ", ".join(snapshots), + "no_snapshots": ", ".join(no_snapshots), + "no_snap_with_databricks": ", ".join(no_snap_with_db), + "snap_and_no_snap_with_databricks": ", ".join(snap_and_no_snap_with_db), + "snap_and_no_snap": ", ".join(snap_and_no_snap), "all.buildvers": ", 
".join(all_buildvers)} + if logger: + logger.debug("release_dict: {}".format(release_dict)) + if buildvers: + return release_dict[buildvers] + + +if __name__ == "__main__": + if len(sys.argv) != 3: + print("usage: get_buildvers.py ") + print(" supported buildvers: databricks, no_snapshots, ...") + else: + print(_get_buildvers(sys.argv[1], sys.argv[2])) diff --git a/build/make-scala-version-build-files.sh b/build/make-scala-version-build-files.sh index c1ca50b0551..ad3482ee979 100755 --- a/build/make-scala-version-build-files.sh +++ b/build/make-scala-version-build-files.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash # -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -20,7 +20,7 @@ set -e VALID_VERSIONS=( 2.13 ) declare -A DEFAULT_SPARK -DEFAULT_SPARK[2.12]="spark311" +DEFAULT_SPARK[2.12]="spark320" DEFAULT_SPARK[2.13]="spark330" usage() { @@ -94,4 +94,4 @@ sed_i '//,/[0-9]*\.[0-9]*/,/[0-9]*\.[0-9]*\.[0-9]*[0-9]*\.[0-9]*\.[0-9]*'$SCALA_VERSION' -You can find all shim files for a particular shim, e.g. 312, easily by executing: -git grep '{"spark": "312"}' '*.java' '*.scala' +You can find all shim files for a particular shim, e.g. 320, easily by executing: +git grep '{"spark": "320"}' '*.java' '*.scala' """ import errno @@ -187,13 +187,13 @@ def __csv_as_arr(str_val): __shims_arr = sorted(__csv_ant_prop_as_arr('shimplify.shims')) __dirs_to_derive_shims = sorted(__csv_ant_prop_as_arr('shimplify.dirs')) -__all_shims_arr = sorted(__csv_ant_prop_as_arr('all.buildvers')) - __log = logging.getLogger('shimplify') __log.setLevel(logging.DEBUG if __should_trace else logging.INFO) __ch = logging.StreamHandler() __ch.setFormatter(logging.Formatter('%(name)s - %(levelname)s - %(message)s')) __log.addHandler(__ch) +__all_shims_arr = sorted(__ant_proj_prop("included_buildvers").split(", ")) +__shims_arr = __all_shims_arr if not __shims_arr else __shims_arr __shim_dir_pattern = re.compile(r'spark\d{3}') __shim_comment_pattern = re.compile(re.escape(__opening_shim_tag) + @@ -370,11 +370,6 @@ def __generate_symlinks(): path, build_ver_arr)) -def __map_version_array(shim_json_string): - shim_ver = str(json.loads(shim_json_string).get('spark')) - assert shim_ver in __all_shims_arr, "all.buildvers in pom.xml does not contain %s" % shim_ver - return shim_ver - def __traverse_source_tree_of_all_shims(src_type, func): """Walks src//sparkXYZ""" base_dir = __src_basedir @@ -394,7 +389,7 @@ def __traverse_source_tree_of_all_shims(src_type, func): shim_arr = shim_match.group(1).split(os.linesep) assert len(shim_arr) > 0, "invalid empty shim comment,"\ "orphan shim files should be deleted" - build_ver_arr = map(__map_version_array, shim_arr) + build_ver_arr = map(lambda x: str(json.loads(x).get('spark')), shim_arr) __log.debug("extracted shims %s", build_ver_arr) assert build_ver_arr == sorted(build_ver_arr),\ "%s shim list is not properly sorted" % shim_file_path @@ -500,7 +495,7 @@ def __update_files2bv(files2bv, path, buildver_arr): def __add_new_shim_to_file_map(files2bv): if __add_shim_buildver not in __all_shims_arr: - __log.warning("Update pom.xml to add %s to all.buildvers", __add_shim_buildver) + __log.warning("all.buildvers doesn't contain %s please look at build/get_buildvers.py", __add_shim_buildver) if __add_shim_buildver not in __shims_arr: # TODO should we just bail and ask the user to add to 
all.buildvers manually first? __shims_arr.append(__add_shim_buildver) diff --git a/datagen/README.md b/datagen/README.md index 0bcd572e65e..300b5d331c0 100644 --- a/datagen/README.md +++ b/datagen/README.md @@ -24,12 +24,12 @@ Where `$SPARK_VERSION` is a compressed version number, like 330 for Spark 3.3.0. After this the jar should be at `target/datagen_2.12-$PLUGIN_VERSION-spark$SPARK_VERSION.jar` -for example a Spark 3.3.0 jar for the 24.08.0 release would be -`target/datagen_2.12-24.08.0-spark330.jar` +for example a Spark 3.3.0 jar for the 24.10.0 release would be +`target/datagen_2.12-24.10.0-spark330.jar` To get a spark shell with this you can run ```shell -spark-shell --jars target/datagen_2.12-24.08.0-spark330.jar +spark-shell --jars target/datagen_2.12-24.10.0-spark330.jar ``` After that you should be good to go. diff --git a/datagen/ScaleTest.md b/datagen/ScaleTest.md index 19ca2d21713..a88bd8c2836 100644 --- a/datagen/ScaleTest.md +++ b/datagen/ScaleTest.md @@ -44,7 +44,7 @@ $SPARK_HOME/bin/spark-submit \ --conf spark.sql.parquet.datetimeRebaseModeInWrite=CORRECTED \ --class com.nvidia.rapids.tests.scaletest.ScaleTestDataGen \ # the main class --jars $SPARK_HOME/examples/jars/scopt_2.12-3.7.1.jar \ # one dependency jar just shipped with Spark under $SPARK_HOME -./target/datagen_2.12-24.08.0-SNAPSHOT-spark332.jar \ +./target/datagen_2.12-24.10.0-SNAPSHOT-spark332.jar \ 1 \ 10 \ parquet \ diff --git a/datagen/pom.xml b/datagen/pom.xml index a18797d049f..ccdd6f90fe5 100644 --- a/datagen/pom.xml +++ b/datagen/pom.xml @@ -21,13 +21,13 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.12 - 24.08.0-SNAPSHOT + 24.10.0-SNAPSHOT ../shim-deps/pom.xml datagen_2.12 Data Generator Tools for generating large amounts of data - 24.08.0-SNAPSHOT + 24.10.0-SNAPSHOT datagen diff --git a/datagen/src/main/scala/org/apache/spark/sql/tests/datagen/bigDataGen.scala b/datagen/src/main/scala/org/apache/spark/sql/tests/datagen/bigDataGen.scala index 14e0d4e0970..9dc4de9086f 100644 --- a/datagen/src/main/scala/org/apache/spark/sql/tests/datagen/bigDataGen.scala +++ b/datagen/src/main/scala/org/apache/spark/sql/tests/datagen/bigDataGen.scala @@ -29,7 +29,8 @@ import scala.util.Random import org.apache.spark.sql.{Column, DataFrame, Row, SparkSession} import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.{Expression, XXH64} +import org.apache.spark.sql.catalyst.expressions.{ExpectsInputTypes, Expression, UnaryExpression, XXH64} +import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, ArrayData, DateTimeUtils} import org.apache.spark.sql.functions.{approx_count_distinct, avg, coalesce, col, count, lit, stddev, struct, transform, udf, when} import org.apache.spark.sql.types._ @@ -577,7 +578,8 @@ object DataGen { case class DataGenExpr(child: Expression, override val dataType: DataType, canHaveNulls: Boolean, - f: GeneratorFunction) extends DataGenExprBase { + f: GeneratorFunction) + extends UnaryExpression with ExpectsInputTypes with CodegenFallback { override def nullable: Boolean = canHaveNulls @@ -587,6 +589,9 @@ case class DataGenExpr(child: Expression, val rowLoc = new RowLocation(child.eval(input).asInstanceOf[Long]) f(rowLoc) } + + override def withNewChildInternal(newChild: Expression): Expression = + DataGenExpr(newChild, dataType, canHaveNulls, f) } abstract class CommonDataGen( @@ -2670,7 +2675,9 @@ object ColumnGen { dataType: DataType, nullable: Boolean, gen: GeneratorFunction): 
Column = { - Column(DataGenExpr(rowNumber.expr, dataType, nullable, gen)) + val rowNumberExpr = DataGenExprShims.columnToExpr(rowNumber) + val expr = DataGenExpr(rowNumberExpr, dataType, nullable, gen) + DataGenExprShims.exprToColumn(expr) } } diff --git a/datagen/src/main/spark320/scala/org/apache/spark/sql/tests/datagen/datagen/DataGenExprBase.scala b/datagen/src/main/spark320/scala/org/apache/spark/sql/tests/datagen/DataGenExprShims.scala similarity index 70% rename from datagen/src/main/spark320/scala/org/apache/spark/sql/tests/datagen/datagen/DataGenExprBase.scala rename to datagen/src/main/spark320/scala/org/apache/spark/sql/tests/datagen/DataGenExprShims.scala index ccbb03c4faa..9134505c2f2 100644 --- a/datagen/src/main/spark320/scala/org/apache/spark/sql/tests/datagen/datagen/DataGenExprBase.scala +++ b/datagen/src/main/spark320/scala/org/apache/spark/sql/tests/datagen/DataGenExprShims.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -37,14 +37,14 @@ {"spark": "343"} {"spark": "350"} {"spark": "351"} -{"spark": "400"} +{"spark": "352"} spark-rapids-shim-json-lines ***/ package org.apache.spark.sql.tests.datagen -import org.apache.spark.sql.catalyst.expressions.{ExpectsInputTypes, Expression, UnaryExpression} -import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback +import org.apache.spark.sql.Column +import org.apache.spark.sql.catalyst.expressions.Expression -trait DataGenExprBase extends UnaryExpression with ExpectsInputTypes with CodegenFallback { - override def withNewChildInternal(newChild: Expression): Expression = - legacyWithNewChildren(Seq(newChild)) +object DataGenExprShims { + def columnToExpr(c: Column): Expression = c.expr + def exprToColumn(e: Expression): Column = Column(e) } diff --git a/datagen/src/main/spark311/scala/org/apache/spark/sql/tests/datagen/DataGenExprBase.scala b/datagen/src/main/spark400/scala/org/apache/spark/sql/tests/datagen/DataGenExprShims.scala similarity index 66% rename from datagen/src/main/spark311/scala/org/apache/spark/sql/tests/datagen/DataGenExprBase.scala rename to datagen/src/main/spark400/scala/org/apache/spark/sql/tests/datagen/DataGenExprShims.scala index d50008f7fb7..2884968660d 100644 --- a/datagen/src/main/spark311/scala/org/apache/spark/sql/tests/datagen/DataGenExprBase.scala +++ b/datagen/src/main/spark400/scala/org/apache/spark/sql/tests/datagen/DataGenExprShims.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -15,13 +15,15 @@ */ /*** spark-rapids-shim-json-lines -{"spark": "311"} -{"spark": "312"} -{"spark": "313"} +{"spark": "400"} spark-rapids-shim-json-lines ***/ package org.apache.spark.sql.tests.datagen -import org.apache.spark.sql.catalyst.expressions.{ExpectsInputTypes, UnaryExpression} -import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback +import org.apache.spark.sql.Column +import org.apache.spark.sql.catalyst.expressions.Expression +import org.apache.spark.sql.internal.ExpressionUtils.{column, expression} -trait DataGenExprBase extends UnaryExpression with ExpectsInputTypes with CodegenFallback +object DataGenExprShims { + def columnToExpr(c: Column): Expression = c + def exprToColumn(e: Expression): Column = e +} diff --git a/delta-lake/common/src/main/databricks/scala/com/databricks/sql/transaction/tahoe/rapids/GpuOptimisticTransactionBase.scala b/delta-lake/common/src/main/databricks/scala/com/databricks/sql/transaction/tahoe/rapids/GpuOptimisticTransactionBase.scala index 8d5abac35b6..b6e9e11946d 100644 --- a/delta-lake/common/src/main/databricks/scala/com/databricks/sql/transaction/tahoe/rapids/GpuOptimisticTransactionBase.scala +++ b/delta-lake/common/src/main/databricks/scala/com/databricks/sql/transaction/tahoe/rapids/GpuOptimisticTransactionBase.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * This file was derived from OptimisticTransaction.scala and TransactionalWrite.scala * in the Delta Lake project at https://github.com/delta-io/delta. @@ -100,7 +100,7 @@ abstract class GpuOptimisticTransactionBase GpuAlias(GpuEmpty2Null(p), p.name)() case attr => attr } - if (needConvert) GpuProjectExec(projectList.toList, plan)() else plan + if (needConvert) GpuProjectExec(projectList.toList, plan) else plan } /** diff --git a/delta-lake/common/src/main/databricks/scala/com/nvidia/spark/rapids/delta/GpuDeltaParquetFileFormatBase.scala b/delta-lake/common/src/main/databricks/scala/com/nvidia/spark/rapids/delta/GpuDeltaParquetFileFormatBase.scala index 598cf8248f4..f845b321a4a 100644 --- a/delta-lake/common/src/main/databricks/scala/com/nvidia/spark/rapids/delta/GpuDeltaParquetFileFormatBase.scala +++ b/delta-lake/common/src/main/databricks/scala/com/nvidia/spark/rapids/delta/GpuDeltaParquetFileFormatBase.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,7 +16,8 @@ package com.nvidia.spark.rapids.delta -import com.databricks.sql.transaction.tahoe.{DeltaColumnMapping, DeltaColumnMappingMode, NoMapping} +import com.databricks.sql.transaction.tahoe.{DeltaColumnMapping, DeltaColumnMappingMode, NameMapping, NoMapping} +import com.databricks.sql.transaction.tahoe.schema.SchemaMergingUtils import com.nvidia.spark.rapids.{GpuMetric, GpuParquetMultiFilePartitionReaderFactory, GpuReadParquetFileFormat} import org.apache.hadoop.conf.Configuration @@ -27,7 +28,7 @@ import org.apache.spark.sql.connector.read.PartitionReaderFactory import org.apache.spark.sql.execution.datasources.PartitionedFile import org.apache.spark.sql.rapids.GpuFileSourceScanExec import org.apache.spark.sql.sources.Filter -import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.types.{MetadataBuilder, StructType} import org.apache.spark.util.SerializableConfiguration abstract class GpuDeltaParquetFileFormatBase extends GpuReadParquetFileFormat { @@ -35,7 +36,18 @@ abstract class GpuDeltaParquetFileFormatBase extends GpuReadParquetFileFormat { val referenceSchema: StructType def prepareSchema(inputSchema: StructType): StructType = { - DeltaColumnMapping.createPhysicalSchema(inputSchema, referenceSchema, columnMappingMode) + val schema = DeltaColumnMapping.createPhysicalSchema( + inputSchema, referenceSchema, columnMappingMode) + if (columnMappingMode == NameMapping) { + SchemaMergingUtils.transformColumns(schema) { (_, field, _) => + field.copy(metadata = new MetadataBuilder() + .withMetadata(field.metadata) + .remove(DeltaColumnMapping.PARQUET_FIELD_ID_METADATA_KEY) + .build()) + } + } else { + schema + } } override def createMultiFileReaderFactory( diff --git a/delta-lake/common/src/main/delta-io/scala/com/nvidia/spark/rapids/delta/GpuDeltaParquetFileFormat.scala b/delta-lake/common/src/main/delta-io/scala/com/nvidia/spark/rapids/delta/GpuDeltaParquetFileFormat.scala index 96cb4b6d552..ad0f9841b39 100644 --- a/delta-lake/common/src/main/delta-io/scala/com/nvidia/spark/rapids/delta/GpuDeltaParquetFileFormat.scala +++ b/delta-lake/common/src/main/delta-io/scala/com/nvidia/spark/rapids/delta/GpuDeltaParquetFileFormat.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -23,11 +23,12 @@ import org.apache.spark.broadcast.Broadcast import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.connector.read.PartitionReaderFactory -import org.apache.spark.sql.delta.{DeltaColumnMapping, DeltaColumnMappingMode, NoMapping} +import org.apache.spark.sql.delta.{DeltaColumnMapping, DeltaColumnMappingMode, NameMapping, NoMapping} +import org.apache.spark.sql.delta.schema.SchemaMergingUtils import org.apache.spark.sql.execution.datasources.PartitionedFile import org.apache.spark.sql.rapids.GpuFileSourceScanExec import org.apache.spark.sql.sources.Filter -import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.types.{MetadataBuilder, StructType} import org.apache.spark.util.SerializableConfiguration trait GpuDeltaParquetFileFormat extends GpuReadParquetFileFormat { @@ -35,7 +36,18 @@ trait GpuDeltaParquetFileFormat extends GpuReadParquetFileFormat { val referenceSchema: StructType def prepareSchema(inputSchema: StructType): StructType = { - DeltaColumnMapping.createPhysicalSchema(inputSchema, referenceSchema, columnMappingMode) + val schema = DeltaColumnMapping.createPhysicalSchema( + inputSchema, referenceSchema, columnMappingMode) + if (columnMappingMode == NameMapping) { + SchemaMergingUtils.transformColumns(schema) { (_, field, _) => + field.copy(metadata = new MetadataBuilder() + .withMetadata(field.metadata) + .remove(DeltaColumnMapping.PARQUET_FIELD_ID_METADATA_KEY) + .build()) + } + } else { + schema + } } override def createMultiFileReaderFactory( diff --git a/delta-lake/common/src/main/delta-io/scala/org/apache/spark/sql/delta/rapids/GpuOptimisticTransactionBase.scala b/delta-lake/common/src/main/delta-io/scala/org/apache/spark/sql/delta/rapids/GpuOptimisticTransactionBase.scala index 1d5b5a1f72c..afa5ecbc6ee 100644 --- a/delta-lake/common/src/main/delta-io/scala/org/apache/spark/sql/delta/rapids/GpuOptimisticTransactionBase.scala +++ b/delta-lake/common/src/main/delta-io/scala/org/apache/spark/sql/delta/rapids/GpuOptimisticTransactionBase.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -80,7 +80,7 @@ abstract class GpuOptimisticTransactionBase( GpuAlias(GpuEmpty2Null(p), p.name)() case attr => attr } - if (needConvert) GpuProjectExec(projectList.toList, plan)() else plan + if (needConvert) GpuProjectExec(projectList.toList, plan) else plan } /** diff --git a/delta-lake/common/src/main/scala/com/nvidia/spark/rapids/delta/GpuDeltaParquetFileFormatUtils.scala b/delta-lake/common/src/main/scala/com/nvidia/spark/rapids/delta/GpuDeltaParquetFileFormatUtils.scala index 101a82da830..1ade53b21b9 100644 --- a/delta-lake/common/src/main/scala/com/nvidia/spark/rapids/delta/GpuDeltaParquetFileFormatUtils.scala +++ b/delta-lake/common/src/main/scala/com/nvidia/spark/rapids/delta/GpuDeltaParquetFileFormatUtils.scala @@ -17,8 +17,8 @@ package com.nvidia.spark.rapids.delta import ai.rapids.cudf.{ColumnVector => CudfColumnVector, Scalar, Table} +import com.nvidia.spark.rapids.{GpuColumnVector, GpuMetric} import com.nvidia.spark.rapids.Arm.{closeOnExcept, withResource} -import com.nvidia.spark.rapids.GpuColumnVector import org.roaringbitmap.longlong.{PeekableLongIterator, Roaring64Bitmap} import org.apache.spark.sql.types.{BooleanType, LongType, StringType, StructField, StructType} @@ -53,7 +53,9 @@ object GpuDeltaParquetFileFormatUtils { schema: StructType, delVector: Option[Roaring64Bitmap], input: Iterator[ColumnarBatch], - maxBatchSize: Int): Iterator[ColumnarBatch] = { + maxBatchSize: Int, + delVectorScatterTimeMetric: GpuMetric + ): Iterator[ColumnarBatch] = { val metadataRowIndexCol = schema.fieldNames.indexOf(METADATA_ROW_IDX_COL) val delRowIdx = schema.fieldNames.indexOf(METADATA_ROW_DEL_COL) if (metadataRowIndexCol == -1 && delRowIdx == -1) { @@ -74,20 +76,31 @@ object GpuDeltaParquetFileFormatUtils { Some(delRowIdx) } val newBatch = addMetadataColumns(rowIdxCol, delRowIdx2, delVector,maxBatchSize, - rowIndex, batch) + rowIndex, batch, delVectorScatterTimeMetric) rowIndex += batch.numRows() newBatch } } } + private def createFalseTable(numRows: Int): Table = { + withResource(Scalar.fromBool(false)) { s => + withResource(CudfColumnVector.fromScalar(s, numRows)) { c => + new Table(c) + } + } + } + + private def addMetadataColumns( rowIdxPos: Option[Int], delRowIdx: Option[Int], delVec: Option[Roaring64Bitmap], maxBatchSize: Int, rowIdxStart: Long, - batch: ColumnarBatch): ColumnarBatch = { + batch: ColumnarBatch, + delVectorScatterTimeMetric: GpuMetric, + ): ColumnarBatch = { val rowIdxCol = rowIdxPos.map { _ => withResource(Scalar.fromLong(rowIdxStart)) { start => GpuColumnVector.from(CudfColumnVector.sequence(start, batch.numRows()), @@ -98,30 +111,26 @@ object GpuDeltaParquetFileFormatUtils { closeOnExcept(rowIdxCol) { rowIdxCol => val delVecCol = delVec.map { delVec => - withResource(Scalar.fromBool(false)) { s => - withResource(CudfColumnVector.fromScalar(s, batch.numRows())) { c => - var table = new Table(c) - val posIter = new RoaringBitmapIterator( - delVec.getLongIteratorFrom(rowIdxStart), - rowIdxStart, - rowIdxStart + batch.numRows(), - ).grouped(Math.min(maxBatchSize, batch.numRows())) - - for (posChunk <- posIter) { - withResource(CudfColumnVector.fromLongs(posChunk: _*)) { poses => - withResource(Scalar.fromBool(true)) { s => - table = withResource(table) { _ => + delVectorScatterTimeMetric.ns { + val table = new RoaringBitmapIterator( + delVec.getLongIteratorFrom(rowIdxStart), + rowIdxStart, + rowIdxStart + batch.numRows()) + .grouped(Math.min(maxBatchSize, batch.numRows())) + .foldLeft(createFalseTable(batch.numRows())){ (table, posChunk) => + withResource(table) 
{ _ => + withResource(CudfColumnVector.fromLongs(posChunk: _*)) { poses => + withResource(Scalar.fromBool(true)) { s => Table.scatter(Array(s), poses, table) } } } } - withResource(table) { _ => - GpuColumnVector.from(table.getColumn(0).incRefCount(), - METADATA_ROW_DEL_FIELD.dataType) - } - } + withResource(table) { _ => + GpuColumnVector.from(table.getColumn(0).incRefCount(), + METADATA_ROW_DEL_FIELD.dataType) + } } } diff --git a/delta-lake/common/src/main/scala/com/nvidia/spark/rapids/delta/GpuStatisticsCollection.scala b/delta-lake/common/src/main/scala/com/nvidia/spark/rapids/delta/GpuStatisticsCollection.scala index 8573a6979bf..7b246014f67 100644 --- a/delta-lake/common/src/main/scala/com/nvidia/spark/rapids/delta/GpuStatisticsCollection.scala +++ b/delta-lake/common/src/main/scala/com/nvidia/spark/rapids/delta/GpuStatisticsCollection.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * This file was derived from StatisticsCollection.scala * in the Delta Lake project at https://github.com/delta-io/delta. @@ -31,7 +31,7 @@ import com.nvidia.spark.rapids.delta.shims.{ShimDeltaColumnMapping, ShimDeltaUDF import org.apache.spark.sql.{Column, SparkSession} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute -import org.apache.spark.sql.functions.{count, lit, max, min, struct, substring, sum, when} +import org.apache.spark.sql.functions.{count, lit, max, min, struct, sum, when} import org.apache.spark.sql.types._ import org.apache.spark.sql.vectorized.ColumnarBatch @@ -87,7 +87,9 @@ trait GpuStatisticsCollection extends ShimUsesMetadataFields { collectStats(MIN, statCollectionSchema) { // Truncate string min values as necessary case (c, GpuSkippingEligibleDataType(StringType), true) => - substring(min(c), 0, stringPrefixLength) + val udfTruncateMin = ShimDeltaUDF.stringStringUdf( + GpuStatisticsCollection.truncateMinStringAgg(prefixLength)_) + udfTruncateMin(min(c)) // Collect all numeric min values case (c, GpuSkippingEligibleDataType(_), true) => @@ -203,25 +205,76 @@ trait GpuStatisticsCollection extends ShimUsesMetadataFields { } object GpuStatisticsCollection { + val ASCII_MAX_CHARACTER = '\u007F' + + val UTF8_MAX_CHARACTER = new String(Character.toChars(Character.MAX_CODE_POINT)) + + def truncateMinStringAgg(prefixLen: Int)(input: String): String = { + if (input == null || input.length <= prefixLen) { + return input + } + if (prefixLen <= 0) { + return null + } + if (Character.isHighSurrogate(input.charAt(prefixLen - 1)) && + Character.isLowSurrogate(input.charAt(prefixLen))) { + // If the character at prefixLen - 1 is a high surrogate and the next character is a low + // surrogate, we need to include the next character in the prefix to ensure that we don't + // truncate the string in the middle of a surrogate pair. + input.take(prefixLen + 1) + } else { + input.take(prefixLen) + } + } + /** - * Helper method to truncate the input string `x` to the given `prefixLen` length, while also - * appending the unicode max character to the end of the truncated string. This ensures that any - * value in this column is less than or equal to the max. + * Helper method to truncate the input string `input` to the given `prefixLen` length, while also + * ensuring the any value in this column is less than or equal to the truncated max in UTF-8 + * encoding. 
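+ * e.g. (illustrative): truncateMaxStringAgg(6)("abcdefghi") returns "abcdef" + '\u007F',
+ * which compares greater than or equal to the original value.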
*/ - def truncateMaxStringAgg(prefixLen: Int)(x: String): String = { - if (x == null || x.length <= prefixLen) { - x - } else { - // Grab the prefix. We want to append `\ufffd` as a tie-breaker, but that is only safe - // if the character we truncated was smaller. Keep extending the prefix until that - // condition holds, or we run off the end of the string. - // scalastyle:off nonascii - val tieBreaker = '\ufffd' - x.take(prefixLen) + x.substring(prefixLen).takeWhile(_ >= tieBreaker) + tieBreaker - // scalastyle:off nonascii + def truncateMaxStringAgg(prefixLen: Int)(originalMax: String): String = { + // scalastyle:off nonascii + if (originalMax == null || originalMax.length <= prefixLen) { + return originalMax } + if (prefixLen <= 0) { + return null + } + + // Grab the prefix. We want to append max Unicode code point `\uDBFF\uDFFF` as a tie-breaker, + // but that is only safe if the character we truncated was smaller in UTF-8 encoded binary + // comparison. Keep extending the prefix until that condition holds, or we run off the end of + // the string. + // We also try to use the ASCII max character `\u007F` as a tie-breaker if possible. + val maxLen = getExpansionLimit(prefixLen) + // Start with a valid prefix + var currLen = truncateMinStringAgg(prefixLen)(originalMax).length + while (currLen <= maxLen) { + if (currLen >= originalMax.length) { + // Return originalMax if we have reached the end of the string + return originalMax + } else if (currLen + 1 < originalMax.length && + originalMax.substring(currLen, currLen + 2) == UTF8_MAX_CHARACTER) { + // Skip the UTF-8 max character. It occupies two characters in a Scala string. + currLen += 2 + } else if (originalMax.charAt(currLen) < ASCII_MAX_CHARACTER) { + return originalMax.take(currLen) + ASCII_MAX_CHARACTER + } else { + return originalMax.take(currLen) + UTF8_MAX_CHARACTER + } + } + + // Return null when the input string is too long to truncate. + null + // scalastyle:on nonascii } + /** + * Calculates the upper character limit when constructing a maximum is not possible with only + * prefixLen chars. 
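+ * e.g. (illustrative): for prefixLen = 32 the limit is 64 characters, after which
+ * truncateMaxStringAgg gives up and returns null.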
+ */ + private def getExpansionLimit(prefixLen: Int): Int = 2 * prefixLen + def batchStatsToRow( schema: StructType, explodedDataSchema: Map[Seq[String], Int], diff --git a/delta-lake/delta-20x/pom.xml b/delta-lake/delta-20x/pom.xml index dfc34fad987..35760b28288 100644 --- a/delta-lake/delta-20x/pom.xml +++ b/delta-lake/delta-20x/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 24.08.0-SNAPSHOT + 24.10.0-SNAPSHOT ../../jdk-profiles/pom.xml rapids-4-spark-delta-20x_2.12 RAPIDS Accelerator for Apache Spark Delta Lake 2.0.x Support Delta Lake 2.0.x support for the RAPIDS Accelerator for Apache Spark - 24.08.0-SNAPSHOT + 24.10.0-SNAPSHOT ../delta-lake/delta-20x @@ -39,6 +39,10 @@ + + org.roaringbitmap + RoaringBitmap + com.nvidia rapids-4-spark-sql_${scala.binary.version} diff --git a/delta-lake/delta-21x/pom.xml b/delta-lake/delta-21x/pom.xml index 8770a7e3d9d..3c671173415 100644 --- a/delta-lake/delta-21x/pom.xml +++ b/delta-lake/delta-21x/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 24.08.0-SNAPSHOT + 24.10.0-SNAPSHOT ../../jdk-profiles/pom.xml rapids-4-spark-delta-21x_2.12 RAPIDS Accelerator for Apache Spark Delta Lake 2.1.x Support Delta Lake 2.1.x support for the RAPIDS Accelerator for Apache Spark - 24.08.0-SNAPSHOT + 24.10.0-SNAPSHOT ../delta-lake/delta-21x @@ -39,6 +39,10 @@ + + org.roaringbitmap + RoaringBitmap + com.nvidia rapids-4-spark-sql_${scala.binary.version} diff --git a/delta-lake/delta-22x/pom.xml b/delta-lake/delta-22x/pom.xml index be9e122e5df..1e576f97e98 100644 --- a/delta-lake/delta-22x/pom.xml +++ b/delta-lake/delta-22x/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 24.08.0-SNAPSHOT + 24.10.0-SNAPSHOT ../../jdk-profiles/pom.xml rapids-4-spark-delta-22x_2.12 RAPIDS Accelerator for Apache Spark Delta Lake 2.2.x Support Delta Lake 2.2.x support for the RAPIDS Accelerator for Apache Spark - 24.08.0-SNAPSHOT + 24.10.0-SNAPSHOT ../delta-lake/delta-22x @@ -39,6 +39,10 @@ + + org.roaringbitmap + RoaringBitmap + com.nvidia rapids-4-spark-sql_${scala.binary.version} diff --git a/delta-lake/delta-23x/pom.xml b/delta-lake/delta-23x/pom.xml index 97207dca741..f0a488cb5f2 100644 --- a/delta-lake/delta-23x/pom.xml +++ b/delta-lake/delta-23x/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-parent_2.12 - 24.08.0-SNAPSHOT + 24.10.0-SNAPSHOT ../../pom.xml rapids-4-spark-delta-23x_2.12 RAPIDS Accelerator for Apache Spark Delta Lake 2.3.x Support Delta Lake 2.3.x support for the RAPIDS Accelerator for Apache Spark - 24.08.0-SNAPSHOT + 24.10.0-SNAPSHOT ../delta-lake/delta-23x @@ -39,6 +39,10 @@ + + org.roaringbitmap + RoaringBitmap + com.nvidia rapids-4-spark-sql_${scala.binary.version} diff --git a/delta-lake/delta-24x/pom.xml b/delta-lake/delta-24x/pom.xml index 19e0be3c90d..883e9de2933 100644 --- a/delta-lake/delta-24x/pom.xml +++ b/delta-lake/delta-24x/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 24.08.0-SNAPSHOT + 24.10.0-SNAPSHOT ../../jdk-profiles/pom.xml rapids-4-spark-delta-24x_2.12 RAPIDS Accelerator for Apache Spark Delta Lake 2.4.x Support Delta Lake 2.4.x support for the RAPIDS Accelerator for Apache Spark - 24.08.0-SNAPSHOT + 24.10.0-SNAPSHOT ../delta-lake/delta-24x @@ -39,6 +39,10 @@ + + org.roaringbitmap + RoaringBitmap + com.nvidia rapids-4-spark-sql_${scala.binary.version} diff --git a/delta-lake/delta-24x/src/main/scala/com/nvidia/spark/rapids/delta/delta24x/GpuDelta24xParquetFileFormat.scala 
b/delta-lake/delta-24x/src/main/scala/com/nvidia/spark/rapids/delta/delta24x/GpuDelta24xParquetFileFormat.scala index ef579d78e6f..77891864537 100644 --- a/delta-lake/delta-24x/src/main/scala/com/nvidia/spark/rapids/delta/delta24x/GpuDelta24xParquetFileFormat.scala +++ b/delta-lake/delta-24x/src/main/scala/com/nvidia/spark/rapids/delta/delta24x/GpuDelta24xParquetFileFormat.scala @@ -88,15 +88,22 @@ case class GpuDelta24xParquetFileFormat( val maxDelVecScatterBatchSize = RapidsConf .DELTA_LOW_SHUFFLE_MERGE_SCATTER_DEL_VECTOR_BATCH_SIZE .get(sparkSession.sessionState.conf) + val delVecScatterTimeMetric = metrics(GpuMetric.DELETION_VECTOR_SCATTER_TIME) + val delVecSizeMetric = metrics(GpuMetric.DELETION_VECTOR_SIZE) + (file: PartitionedFile) => { val input = dataReader(file) val dv = delVecs.flatMap(_.value.get(new URI(file.filePath.toString()))) - .map(dv => RoaringBitmapWrapper.deserializeFromBytes(dv.descriptor.inlineData).inner) + .map { dv => + delVecSizeMetric += dv.descriptor.inlineData.length + RoaringBitmapWrapper.deserializeFromBytes(dv.descriptor.inlineData).inner + } addMetadataColumnToIterator(prepareSchema(requiredSchema), dv, input.asInstanceOf[Iterator[ColumnarBatch]], - maxDelVecScatterBatchSize) + maxDelVecScatterBatchSize, + delVecScatterTimeMetric) .asInstanceOf[Iterator[InternalRow]] } } diff --git a/delta-lake/delta-spark330db/pom.xml b/delta-lake/delta-spark330db/pom.xml index abc57f793f8..2c086610bbc 100644 --- a/delta-lake/delta-spark330db/pom.xml +++ b/delta-lake/delta-spark330db/pom.xml @@ -21,15 +21,15 @@ com.nvidia - rapids-4-spark-jdk-profiles_2.12 - 24.08.0-SNAPSHOT - ../../jdk-profiles/pom.xml + rapids-4-spark-shim-deps-parent_2.12 + 24.10.0-SNAPSHOT + ../../shim-deps/pom.xml rapids-4-spark-delta-spark330db_2.12 RAPIDS Accelerator for Apache Spark Databricks 11.3 Delta Lake Support Databricks 11.3 Delta Lake support for the RAPIDS Accelerator for Apache Spark - 24.08.0-SNAPSHOT + 24.10.0-SNAPSHOT ../delta-lake/delta-spark330db @@ -40,17 +40,14 @@ - com.nvidia - rapids-4-spark-sql_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - provided + org.roaringbitmap + RoaringBitmap com.nvidia - rapids-4-spark-db-bom + rapids-4-spark-sql_${scala.binary.version} ${project.version} - pom + ${spark.version.classifier} provided diff --git a/delta-lake/delta-spark332db/pom.xml b/delta-lake/delta-spark332db/pom.xml index 4511a76be71..d7763b00a00 100644 --- a/delta-lake/delta-spark332db/pom.xml +++ b/delta-lake/delta-spark332db/pom.xml @@ -21,15 +21,15 @@ com.nvidia - rapids-4-spark-jdk-profiles_2.12 - 24.08.0-SNAPSHOT - ../../jdk-profiles/pom.xml + rapids-4-spark-shim-deps-parent_2.12 + 24.10.0-SNAPSHOT + ../../shim-deps/pom.xml rapids-4-spark-delta-spark332db_2.12 RAPIDS Accelerator for Apache Spark Databricks 12.2 Delta Lake Support Databricks 12.2 Delta Lake support for the RAPIDS Accelerator for Apache Spark - 24.08.0-SNAPSHOT + 24.10.0-SNAPSHOT ../delta-lake/delta-spark332db @@ -40,17 +40,14 @@ - com.nvidia - rapids-4-spark-sql_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - provided + org.roaringbitmap + RoaringBitmap com.nvidia - rapids-4-spark-db-bom + rapids-4-spark-sql_${scala.binary.version} ${project.version} - pom + ${spark.version.classifier} provided diff --git a/delta-lake/delta-spark341db/pom.xml b/delta-lake/delta-spark341db/pom.xml index 6532062af68..ff8b8da6bf0 100644 --- a/delta-lake/delta-spark341db/pom.xml +++ b/delta-lake/delta-spark341db/pom.xml @@ -20,16 +20,16 @@ 4.0.0 - com.nvidia - 
rapids-4-spark-jdk-profiles_2.12 - 24.08.0-SNAPSHOT - ../../jdk-profiles/pom.xml + com.nvidia + rapids-4-spark-shim-deps-parent_2.12 + 24.10.0-SNAPSHOT + ../../shim-deps/pom.xml rapids-4-spark-delta-spark341db_2.12 RAPIDS Accelerator for Apache Spark Databricks 13.3 Delta Lake Support Databricks 13.3 Delta Lake support for the RAPIDS Accelerator for Apache Spark - 24.08.0-SNAPSHOT + 24.10.0-SNAPSHOT false @@ -38,6 +38,10 @@ + + org.roaringbitmap + RoaringBitmap + com.nvidia rapids-4-spark-sql_${scala.binary.version} @@ -45,221 +49,6 @@ ${spark.version.classifier} provided - - - - org.apache.spark - spark-sql_${scala.binary.version} - ${spark.version} - provided - - - org.apache.spark - spark-core_${scala.binary.version} - ${spark.version} - provided - - - org.apache.spark - spark-annotation_${scala.binary.version} - ${spark.version} - provided - - - org.apache.spark - spark-catalyst_${scala.binary.version} - ${spark.version} - provided - - - org.apache.spark - spark-network-common_${scala.binary.version} - ${spark.version} - provided - - - org.apache.spark - spark-network-shuffle_${scala.binary.version} - ${spark.version} - provided - - - org.apache.spark - spark-launcher_${scala.binary.version} - ${spark.version} - provided - - - org.apache.spark - spark-unsafe_${scala.binary.version} - ${spark.version} - provided - - - org.apache.avro - avro-mapred - ${spark.version} - provided - - - org.apache.avro - avro - ${spark.version} - provided - - - org.apache.hive - hive-exec - ${spark.version} - provided - - - org.apache.hive - hive-serde - ${spark.version} - provided - - - org.apache.spark - spark-hive_${scala.binary.version} - - - com.fasterxml.jackson.core - jackson-core - ${spark.version} - provided - - - com.fasterxml.jackson.core - jackson-annotations - ${spark.version} - provided - - - org.json4s - json4s-ast_${scala.binary.version} - ${spark.version} - provided - - - org.json4s - json4s-core_${scala.binary.version} - ${spark.version} - provided - - - org.apache.commons - commons-io - ${spark.version} - provided - - - org.scala-lang - scala-reflect - ${scala.version} - provided - - - org.apache.commons - commons-lang3 - ${spark.version} - provided - - - com.esotericsoftware.kryo - kryo-shaded-db - ${spark.version} - provided - - - org.apache.parquet - parquet-hadoop - ${spark.version} - provided - - - org.apache.parquet - parquet-common - ${spark.version} - provided - - - org.apache.parquet - parquet-column - ${spark.version} - provided - - - org.apache.parquet - parquet-format - ${spark.version} - provided - - - org.apache.arrow - arrow-memory - ${spark.version} - provided - - - org.apache.arrow - arrow-vector - ${spark.version} - provided - - - org.apache.hadoop - hadoop-client - ${hadoop.client.version} - provided - - - org.apache.orc - orc-core - ${spark.version} - provided - - - org.apache.orc - orc-shims - ${spark.version} - provided - - - org.apache.orc - orc-mapreduce - ${spark.version} - provided - - - org.apache.hive - hive-storage-api - ${spark.version} - provided - - - com.google.protobuf - protobuf-java - ${spark.version} - provided - - - org.apache.spark - spark-common-utils_${scala.binary.version} - ${spark.version} - provided - - - org.apache.spark - spark-sql-api_${scala.binary.version} - ${spark.version} - provided - diff --git a/delta-lake/delta-spark341db/src/main/scala/com/nvidia/spark/rapids/delta/GpuDeltaParquetFileFormat.scala b/delta-lake/delta-spark341db/src/main/scala/com/nvidia/spark/rapids/delta/GpuDeltaParquetFileFormat.scala index 604ed826397..e109b81f1e5 
100644 --- a/delta-lake/delta-spark341db/src/main/scala/com/nvidia/spark/rapids/delta/GpuDeltaParquetFileFormat.scala +++ b/delta-lake/delta-spark341db/src/main/scala/com/nvidia/spark/rapids/delta/GpuDeltaParquetFileFormat.scala @@ -85,14 +85,21 @@ case class GpuDeltaParquetFileFormat( .DELTA_LOW_SHUFFLE_MERGE_SCATTER_DEL_VECTOR_BATCH_SIZE .get(sparkSession.sessionState.conf) + val delVecScatterTimeMetric = metrics(GpuMetric.DELETION_VECTOR_SCATTER_TIME) + val delVecSizeMetric = metrics(GpuMetric.DELETION_VECTOR_SIZE) + (file: PartitionedFile) => { val input = dataReader(file) val dv = delVecs.flatMap(_.value.get(new URI(file.filePath.toString()))) - .map(dv => RoaringBitmapWrapper.deserializeFromBytes(dv.descriptor.inlineData).inner) + .map { dv => + delVecSizeMetric += dv.descriptor.inlineData.length + RoaringBitmapWrapper.deserializeFromBytes(dv.descriptor.inlineData).inner + } addMetadataColumnToIterator(prepareSchema(requiredSchema), dv, input.asInstanceOf[Iterator[ColumnarBatch]], - maxDelVecScatterBatchSize + maxDelVecScatterBatchSize, + delVecScatterTimeMetric ).asInstanceOf[Iterator[InternalRow]] } } diff --git a/delta-lake/delta-stub/pom.xml b/delta-lake/delta-stub/pom.xml index bfe920e3dc4..e1c841cd9c9 100644 --- a/delta-lake/delta-stub/pom.xml +++ b/delta-lake/delta-stub/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 24.08.0-SNAPSHOT + 24.10.0-SNAPSHOT ../../jdk-profiles/pom.xml rapids-4-spark-delta-stub_2.12 RAPIDS Accelerator for Apache Spark Delta Lake Stub Delta Lake stub for the RAPIDS Accelerator for Apache Spark - 24.08.0-SNAPSHOT + 24.10.0-SNAPSHOT ../delta-lake/delta-stub diff --git a/dist/README.md b/dist/README.md index 56e5b9be297..2a0955da6dd 100644 --- a/dist/README.md +++ b/dist/README.md @@ -17,21 +17,21 @@ Files are: `com.nvidia.spark.rapids.SparkShimServiceProvider.sparkNonSnapshot`, The new uber jar is structured like: -1. Base common classes are user visible classes. For these we use Spark 3.1.1 versions because they are assumed to be +1. Base common classes are user visible classes. For these we use Spark 3.2.0 versions because they are assumed to be bitwise-identical to the other shims, this assumption is subject to the future automatic validation. 2. META-INF/services. This is a file that has to list all the shim versions supported by this jar. The files talked about above for each profile are put into place here for uber jars. Although we currently do not use [ServiceLoader API](https://docs.oracle.com/javase/8/docs/api/java/util/ServiceLoader.html) we use the same service provider discovery mechanism -3. META-INF base files are from 3.1.1 - maven, LICENSE, NOTICE, etc +3. META-INF base files are from 3.2.0 - maven, LICENSE, NOTICE, etc 4. Spark specific directory (aka Parallel World in the jargon of [ParallelWorldClassloader](https://github.com/openjdk/jdk/blob/jdk8-b120/jaxws/src/share/jaxws_classes/com/sun/istack/internal/tools/ParallelWorldClassLoader.java)) -for each version of Spark supported in the jar, i.e., spark311/, spark312/, spark320/, etc. +for each version of Spark supported in the jar, i.e., spark320/, spark330/, spark341/, etc. If you have to change the contents of the uber jar the following files control what goes into the base jar as classes that are not shaded. -1. `unshimmed-common-from-spark311.txt` - This has classes and files that should go into the base jar with their normal +1. 
`unshimmed-common-from-spark320.txt` - This has classes and files that should go into the base jar with their normal package name (not shaded). This includes user visible classes (i.e., com/nvidia/spark/SQLPlugin), python files, -and other files that aren't version specific. Uses Spark 3.1.1 built jar for these base classes as explained above. +and other files that aren't version specific. Uses Spark 3.2.0 built jar for these base classes as explained above. 2. `unshimmed-from-each-spark3xx.txt` - This is applied to all the individual Spark specific version jars to pull any files that need to go into the base of the jar and not into the Spark specific directory. diff --git a/dist/build/package-parallel-worlds.py b/dist/build/package-parallel-worlds.py index 568ad5ca55f..ef64a4cd6bd 100644 --- a/dist/build/package-parallel-worlds.py +++ b/dist/build/package-parallel-worlds.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -73,8 +73,8 @@ def shell_exec(shell_cmd): shell_exec(mvn_cmd) dist_dir = os.sep.join([source_basedir, 'dist']) - with open(os.sep.join([dist_dir, 'unshimmed-common-from-spark311.txt']), 'r') as f: - from_spark311 = f.read().splitlines() + with open(os.sep.join([dist_dir, 'unshimmed-common-from-spark320.txt']), 'r') as f: + from_spark320 = f.read().splitlines() with open(os.sep.join([dist_dir, 'unshimmed-from-each-spark3xx.txt']), 'r') as f: from_each = f.read().splitlines() with zipfile.ZipFile(os.sep.join([deps_dir, art_jar]), 'r') as zip_handle: @@ -88,7 +88,7 @@ def shell_exec(shell_cmd): # TODO deprecate namelist = zip_handle.namelist() matching_members = [] - glob_list = from_spark311 + from_each if bv == buildver_list[0] else from_each + glob_list = from_spark320 + from_each if bv == buildver_list[0] else from_each for pat in glob_list: new_matches = fnmatch.filter(namelist, pat) matching_members += new_matches diff --git a/dist/maven-antrun/build-parallel-worlds.xml b/dist/maven-antrun/build-parallel-worlds.xml index 07838616340..bc4d7c9991c 100644 --- a/dist/maven-antrun/build-parallel-worlds.xml +++ b/dist/maven-antrun/build-parallel-worlds.xml @@ -132,7 +132,7 @@ + includesfile="${spark.rapids.source.basedir}/${rapids.module}/unshimmed-common-from-spark320.txt"/> diff --git a/dist/pom.xml b/dist/pom.xml index ecc2018baea..84103299bdc 100644 --- a/dist/pom.xml +++ b/dist/pom.xml @@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 24.08.0-SNAPSHOT + 24.10.0-SNAPSHOT ../jdk-profiles/pom.xml rapids-4-spark_2.12 RAPIDS Accelerator for Apache Spark Distribution Creates the distribution package of the RAPIDS plugin for Apache Spark - 24.08.0-SNAPSHOT + 24.10.0-SNAPSHOT com.nvidia @@ -40,34 +40,50 @@ + + true dist ${project.build.directory}/${project.build.finalName}-${jni.classifier}.jar jar:file:${dist.jar.name}!/META-INF/maven/${project.groupId}/${project.artifactId}/pom.xml none false + none noSnapshotsWithDatabricks - - ${noSnapshot.buildvers}, - ${databricks.buildvers} - + false + no_snap_with_databricks + + + + snapshotsWithDatabricks + + false + snap_and_no_snap_with_databricks noSnapshots - - - ${noSnapshot.buildvers} - - - + false + no_snapshots + + + + snapshots + + false + snap_and_no_snap + + + + databricks + + false + databricks @@ -111,18 +127,18 @@ - jdk17-test + jdk17-scala213-test - ${jdk17.buildvers} + ${jdk17.scala213.buildvers} - databricks + 
jdk17-test - ${databricks.buildvers} + ${jdk17.buildvers} @@ -145,7 +161,6 @@ minimumFeatureVersionMix - 312, 320, 321cdh, 330, @@ -153,58 +168,6 @@ - - snapshots - - - ${noSnapshot.buildvers}, - ${snapshot.buildvers} - - - - - snapshotOnly - - - ${snapshot.buildvers} - - - - - snapshotsWithDatabricks - - - ${noSnapshot.buildvers}, - ${snapshot.buildvers}, - ${databricks.buildvers} - - - - - noSnapshotsScala213 - - - ${noSnapshotScala213.buildvers} - - - - - snapshotsScala213 - - - ${noSnapshotScala213.buildvers}, - ${snapshotScala213.buildvers} - - - - - snapshotScala213Only - - - ${snapshotScala213.buildvers} - - - individual @@ -389,7 +352,7 @@ self.log("... OK") - + @@ -555,6 +518,25 @@ self.log("... OK") + + org.apache.maven.plugins + maven-antrun-plugin + 3.1.0 + + + set-included-buidlvers + run + generate-sources + + ${dyn.shim.include.buildvers.skip} + +
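
The Delta shim changes above thread two new metrics (deletion-vector serialized size and scatter time) through the GPU Parquet read path. Below is a minimal, standalone Scala sketch of that pattern, assuming plain `Long` counters in place of the plugin's `GpuMetric` accumulators and the raw `org.roaringbitmap.RoaringBitmap` API in place of `RoaringBitmapWrapper`; the object and method names are illustrative only, not part of the plugin.

```scala
import java.io.{ByteArrayInputStream, ByteArrayOutputStream, DataInputStream, DataOutputStream}
import org.roaringbitmap.RoaringBitmap

object DeletionVectorMetricSketch {
  // Stand-ins for the plugin's GpuMetric accumulators (hypothetical names).
  var delVecSizeMetric: Long = 0L
  var delVecScatterTimeMetric: Long = 0L

  /** Serialize a bitmap to bytes, the way an inline deletion-vector descriptor carries it. */
  def toBytes(bm: RoaringBitmap): Array[Byte] = {
    val bos = new ByteArrayOutputStream()
    bm.serialize(new DataOutputStream(bos))
    bos.toByteArray
  }

  /** Deserialize while recording the serialized size, mirroring the `.map { dv => ... }` change. */
  def fromBytesWithMetric(bytes: Array[Byte]): RoaringBitmap = {
    delVecSizeMetric += bytes.length
    val bm = new RoaringBitmap()
    bm.deserialize(new DataInputStream(new ByteArrayInputStream(bytes)))
    bm
  }

  /** Time a "scatter" step (here just a membership scan) the way the new scatter-time metric would. */
  def scatterTimed(bm: RoaringBitmap, numRows: Int): Array[Boolean] = {
    val start = System.nanoTime()
    val deleted = Array.tabulate(numRows)(i => bm.contains(i))
    delVecScatterTimeMetric += System.nanoTime() - start
    deleted
  }

  def main(args: Array[String]): Unit = {
    val dv = RoaringBitmap.bitmapOf(1, 5, 42)
    val mask = scatterTimed(fromBytesWithMetric(toBytes(dv)), 50)
    println(s"size metric = $delVecSizeMetric bytes, deleted rows = ${mask.count(identity)}")
  }
}
```

In the actual diff, `GpuMetric`'s `+=` plays the role of these counters, and the scatter time is measured inside `addMetadataColumnToIterator` rather than around a plain scan.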
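
The dist/README.md changes describe the uber jar's "parallel world" layout: shared classes come unshimmed from the Spark 3.2.0 build, while version-specific classes live under spark320/, spark330/, and so on. A minimal sketch of that lookup idea follows; it is not the plugin's actual shim loader, the class name is hypothetical, and it assumes Java 9+ for `readAllBytes`.

```scala
// Illustrative only: resolve classes from a version-specific "parallel world"
// directory inside an uber jar, falling back to standard parent delegation.
class ParallelWorldClassLoader(parent: ClassLoader, world: String)
    extends ClassLoader(parent) {

  override protected def findClass(name: String): Class[_] = {
    // e.g. "spark330/com/nvidia/spark/rapids/SomeShimClass.class"
    val path = s"$world/${name.replace('.', '/')}.class"
    val in = getParent.getResourceAsStream(path)
    if (in == null) throw new ClassNotFoundException(name)
    try {
      val bytes = in.readAllBytes() // Java 9+
      defineClass(name, bytes, 0, bytes.length)
    } finally {
      in.close()
    }
  }
}

// Usage sketch: pick the world matching the running Spark version.
// val shimLoader = new ParallelWorldClassLoader(getClass.getClassLoader, "spark330")
// val cls = shimLoader.loadClass("com.nvidia.spark.rapids.shims.SomeShim") // hypothetical class
```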